####   Загрузка данных

In [216]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler

In [217]:
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    ``mean_squared_error`` yields the mean of the squared residuals, so its
    square root is taken to obtain the RMSE promised by the function name.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [218]:
train_df = pd.read_csv("data/train.csv", index_col = "Id")
test_df = pd.read_csv('data/test.csv', index_col = "Id")

Очистим выборку от выбросов по наиболее значимому признаку - 'GrLivArea' (Жилая площадь)

In [219]:
# Keep only the training rows whose 'GrLivArea' is at most 4000.
# .copy() makes train_df an independent frame, so the column assignments made
# in later cells do not write into a slice of the originally loaded frame.
# The test frame is left exactly as loaded.
train_df = train_df[train_df['GrLivArea'] <= 4000].copy()

Приведём пропуски данных к единому виду

In [220]:
# Rewrite every column of both frames so that values which are the np.nan
# object itself become None; all other values pass through unchanged.
# NOTE(review): the identity test only matches the np.nan object itself -
# confirm that this catches every missing value the later cells rely on.
for frame in (train_df, test_df):
    for column in frame.columns:
        frame[column] = frame[column].apply(
            lambda value: None if value is np.nan else value
        )

Объявим функции, преобразующие данные

In [221]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

def factorize(df, factor_df, column, fill_na=None):
    """Label-encode ``df[column]`` into ``factor_df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source column.
    factor_df : pandas.DataFrame
        Frame that receives the encoded column (modified in place).
    column : str
        Name of the column to encode.
    fill_na : optional
        Label substituted for missing values. When omitted, missing values
        are encoded as the string "None".

    Returns
    -------
    pandas.DataFrame
        ``factor_df`` with ``column`` replaced by integer codes.

    Notes
    -----
    The module-level ``LabelEncoder`` instance ``le`` is refit on every call,
    using only the labels present in ``df[column]``.
    NOTE(review): codes are therefore assigned per frame - confirm that frames
    encoded separately are meant to be comparable.
    """
    placeholder = "None" if fill_na is None else str(fill_na)
    # Missing values must be replaced *before* the str conversion: str() turns
    # them into ordinary text, after which there is nothing left to fill.
    factor_df[column] = df[column].map(
        lambda value: placeholder if pd.isna(value) else str(value)
    )
    le.fit(factor_df[column].unique())
    factor_df[column] = le.transform(factor_df[column])
    return factor_df


def onehot(onehot_df, df, column_name, fill_na, drop_name):
    """Return ``onehot_df`` extended with indicator columns for ``df[column_name]``.

    Parameters
    ----------
    onehot_df : pandas.DataFrame
        Frame the indicator columns are joined onto. It is not modified.
    df : pandas.DataFrame
        Frame holding the categorical source column.
    column_name : str
        Column of ``df`` to encode; the new columns are named
        ``"_<column_name>_<value>"``.
    fill_na : optional
        Value substituted for missing entries before encoding. With ``None``
        missing entries are left as they are.
    drop_name : optional
        Unused. Kept so existing calls stay valid; dropping one level per
        feature was tried and made the results slightly worse.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``onehot_df`` joined with the indicator columns.
    """
    # Work on a standalone Series aligned to the target index instead of
    # writing a temporary column into onehot_df: the fill then never depends on
    # in-place mutation of a column selection, and the caller's frame is untouched.
    values = df[column_name].reindex(onehot_df.index)
    if fill_na is not None:
        values = values.fillna(fill_na)

    dummies = pd.get_dummies(values, prefix="_" + column_name)
    return onehot_df.join(dummies)

In [222]:
train_df["MSSubClass"]

Id
1       60
2       20
3       60
4       70
5       60
        ..
1456    60
1457    20
1458    70
1459    20
1460    20
Name: MSSubClass, Length: 1456, dtype: int64

In [223]:
test_df["MSZoning"]

Id
1461    RH
1462    RL
1463    RL
1464    RL
1465    RL
        ..
2915    RM
2916    RM
2917    RL
2918    RL
2919    RL
Name: MSZoning, Length: 1459, dtype: object

In [224]:
def generate_features(df):
    """Build the numeric feature frame for one raw housing frame.

    The result shares ``df``'s index and contains, in this order: raw numeric
    columns (with missing values set to 0 where the code below does so),
    label-encoded categoricals produced by ``factorize``, ordinal encodings of
    the graded columns (missing -> 0) and a set of derived features (bins,
    totals, ages and presence flags).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame holding the housing columns referenced below.

    Returns
    -------
    pandas.DataFrame
        New frame indexed like ``df``; ``df`` itself is not modified.

    Notes
    -----
    Missing ``LotFrontage`` values are always filled from the module-level
    ``train_df`` (per-neighbourhood median), whichever frame is passed in.
    """

    def ordinal(column, mapping):
        """Map the labels of ``df[column]`` to integers; missing values become 0.

        A label that is absent from ``mapping`` stays NaN and makes the final
        ``astype(int)`` raise instead of being silently coded as 0.
        """
        codes = df[column].map(mapping)
        return codes.where(df[column].notna(), 0).astype(int)

    all_df = pd.DataFrame(index=df.index)
    all_df["LotArea"] = df["LotArea"]

    # Areas for which a missing value is treated as 0.
    for col in ("MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
                "1stFlrSF", "2ndFlrSF", "GrLivArea", "GarageArea"):
        all_df[col] = df[col].fillna(0)

    for col in ("WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"):
        all_df[col] = df[col]

    all_df["BsmtFullBath"] = df["BsmtFullBath"].fillna(0)
    all_df["BsmtHalfBath"] = df["BsmtHalfBath"].fillna(0)
    for col in ("FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
                "TotRmsAbvGrd", "Fireplaces"):
        all_df[col] = df[col]

    all_df["GarageCars"] = df["GarageCars"].fillna(0)
    all_df["CentralAir"] = (df["CentralAir"] == "Y") * 1.0
    all_df["PoolArea"] = df["PoolArea"].fillna(0)
    all_df["OverallQual"] = df["OverallQual"]
    all_df["OverallCond"] = df["OverallCond"]

    # Missing LotFrontage -> median LotFrontage of the same neighbourhood in the
    # training frame. Rows whose neighbourhood has no median there stay NaN.
    neighborhood_median = train_df["LotFrontage"].groupby(train_df["Neighborhood"]).median()
    all_df["LotFrontage"] = df["LotFrontage"].fillna(
        df["Neighborhood"].map(neighborhood_median))

    # Label-encoded categoricals: (column, label used for missing values).
    label_encoded = (
        ("MSSubClass", None),
        ("MSZoning", "RL"),
        ("LotConfig", None),
        ("Neighborhood", None),
        ("Condition1", None),
        ("BldgType", None),
        ("HouseStyle", None),
        ("RoofStyle", None),
        ("Exterior1st", "Other"),
        ("Exterior2nd", "Other"),
        ("MasVnrType", "None"),
        ("Foundation", None),
        ("SaleType", "Oth"),
        ("SaleCondition", None),
    )
    for col, missing_label in label_encoded:
        all_df = factorize(df, all_df, col, missing_label)

    # MSZoning is re-encoded with a fixed code table; this overwrites the
    # label-encoded column created just above (its column position is kept).
    # NOTE(review): the Kaggle CSV is believed to spell the commercial zone
    # "C (all)" while the data description lists "C" - both are accepted here;
    # confirm against data/train.csv.
    zoning_codes = {"A": 1, "C": 2, "C (all)": 2, "FV": 3, "I": 4, "RH": 5,
                    "RL": 6, "RP": 7, "RM": 8}
    all_df["MSZoning"] = df["MSZoning"].map(zoning_codes).fillna(0).astype(float)

    # Ordinal encodings; 0 always stands for "missing".
    quality_codes = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
    for col in ("ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC",
                "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC"):
        all_df[col] = ordinal(col, quality_codes)

    all_df["BsmtExposure"] = ordinal(
        "BsmtExposure", {"No": 1, "Mn": 2, "Av": 3, "Gd": 4})

    bsmt_fin_codes = {"Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
    all_df["BsmtFinType1"] = ordinal("BsmtFinType1", bsmt_fin_codes)
    all_df["BsmtFinType2"] = ordinal("BsmtFinType2", bsmt_fin_codes)

    all_df["Functional"] = ordinal(
        "Functional",
        {"Sal": 1, "Sev": 2, "Maj2": 3, "Maj1": 4,
         "Mod": 5, "Min2": 6, "Min1": 7, "Typ": 8})

    all_df["GarageFinish"] = ordinal("GarageFinish", {"Unf": 1, "RFn": 2, "Fin": 3})

    all_df["Fence"] = ordinal(
        "Fence", {"MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4})

    all_df["YearBuilt"] = df["YearBuilt"]
    all_df["YearRemodAdd"] = df["YearRemodAdd"]

    all_df["GarageYrBlt"] = df["GarageYrBlt"].fillna(0.0)

    all_df["MoSold"] = df["MoSold"]
    all_df["YrSold"] = df["YrSold"]
    all_df["LowQualFinSF"] = df["LowQualFinSF"]
    all_df["MiscVal"] = df["MiscVal"]

    # ------------------------------------------------------------------
    # Derived features
    # ------------------------------------------------------------------

    # Neighbourhood bins; the trailing numbers are the price figures noted by
    # the original author next to each neighbourhood.
    neighborhood_map = {
        'MeadowV': 0,   # 88000
        'IDOTRR': 1,    # 103000
        'BrDale': 1,    # 106000
        'OldTown': 1,   # 119000
        'Edwards': 1,   # 121750
        'BrkSide': 1,   # 124300
        'Sawyer': 1,    # 135000
        'Blueste': 1,   # 137500
        'SWISU': 2,     # 139500
        'NAmes': 2,     # 140000
        'NPkVill': 2,   # 146000
        'Mitchel': 2,   # 153500
        'SawyerW': 2,   # 179900
        'Gilbert': 2,   # 181000
        'NWAmes': 2,    # 182900
        'Blmngtn': 2,   # 191000
        'CollgCr': 2,   # 197200
        'ClearCr': 3,   # 200250
        'Crawfor': 3,   # 200624
        'Veenker': 3,   # 218000
        'Somerst': 3,   # 225500
        'Timber': 3,    # 228475
        'StoneBr': 4,   # 278000
        'NoRidge': 4,   # 301500
        'NridgHt': 4,   # 315000
    }
    all_df["NeighborhoodBin"] = df["Neighborhood"].map(neighborhood_map).fillna(0)

    # Binary groupings are written as membership tests, so the result is an
    # integer 0/1 column whatever the dtype of the source column is.
    all_df["HighSeason"] = df["MoSold"].isin([4, 5, 6, 7]) * 1
    all_df["NewerDwelling"] = df["MSSubClass"].isin([20, 60, 120]) * 1

    good_neighborhood = ['NridgHt', 'Crawfor', 'StoneBr', 'Somerst', 'NoRidge', 'Timber', 'Veenker']
    all_df["Neighborhood_Good"] = df["Neighborhood"].isin(good_neighborhood) * 1

    all_df["SaleCondition_PriceDown"] = df["SaleCondition"].isin(
        ['Abnorml', 'Alloca', 'AdjLand', 'Family']) * 1

    # Whether construction of the house was finished before the sale.
    all_df["BoughtOffPlan"] = (df["SaleCondition"] == "Partial") * 1

    # Heating quality graded Fa or Po.
    all_df["BadHeating"] = df["HeatingQC"].isin(['Fa', 'Po']) * 1

    area_cols = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF',
                 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'LowQualFinSF', 'PoolArea']

    all_df["TotalArea"] = df[area_cols].sum(axis=1)
    all_df["TotalArea1st2nd"] = df["1stFlrSF"] + df["2ndFlrSF"]

    all_df["Age"] = 2010 - df["YearBuilt"]
    all_df["TimeSinceSold"] = 2010 - df["YrSold"]

    all_df["SeasonSold"] = df["MoSold"].map({12: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1,
                                             6: 2, 7: 2, 8: 2, 9: 3, 10: 3, 11: 3}).astype(int)

    all_df['TotalSF'] = (df['TotalBsmtSF'].fillna(0) + df['1stFlrSF'].fillna(0) +
                         df['2ndFlrSF'].fillna(0))

    all_df['Total_sqr_footage'] = (df['BsmtFinSF1'].fillna(0) + df['BsmtFinSF2'].fillna(0) +
                                   df['1stFlrSF'].fillna(0) + df['2ndFlrSF'].fillna(0))

    all_df['Total_Bathrooms'] = (df['FullBath'].fillna(0) + 0.5 * df['HalfBath'].fillna(0) +
                                 df['BsmtFullBath'].fillna(0) + 0.5 * df['BsmtHalfBath'].fillna(0))

    all_df['Total_porch_sf'] = (df['OpenPorchSF'] + df['3SsnPorch'] +
                                df['EnclosedPorch'] + df['ScreenPorch'] +
                                df['WoodDeckSF'])

    all_df["YearsSinceRemodel"] = df["YrSold"] - df["YearRemodAdd"]

    # Presence flags: 1 when the source quantity is greater than zero, i.e. the
    # house actually has the feature; a missing quantity counts as absent.
    presence_flags = (
        ("Has2ndFloor", "2ndFlrSF"),
        ("HasMasVnr", "MasVnrArea"),
        ("HasWoodDeck", "WoodDeckSF"),
        ("HasOpenPorch", "OpenPorchSF"),
        ("HasEnclosedPorch", "EnclosedPorch"),
        ("Has3SsnPorch", "3SsnPorch"),
        ("HasScreenPorch", "ScreenPorch"),
        ("haspool", "PoolArea"),
        ("has2ndfloor", "2ndFlrSF"),
        ("hasgarage", "GarageArea"),
        ("hasbsmt", "TotalBsmtSF"),
        ("hasfireplace", "Fireplaces"),
    )
    for flag, source in presence_flags:
        all_df[flag] = (df[source] > 0) * 1

    return all_df

In [225]:
train = generate_features(train_df)
test = generate_features(test_df)

In [226]:
test_df["MSZoning"]

Id
1461    RH
1462    RL
1463    RL
1464    RL
1465    RL
        ..
2915    RM
2916    RM
2917    RL
2918    RL
2919    RL
Name: MSZoning, Length: 1459, dtype: object

In [227]:
train.shape

(1456, 95)

In [228]:
test.shape

(1459, 95)

Проведём масштабирование признаков.

In [229]:
# Keep stand-alone one-column copies of "NeighborhoodBin" for both frames,
# taken before the transformations applied in the following cells.
neighborhood_bin_train = train[["NeighborhoodBin"]].copy()
neighborhood_bin_test = test[["NeighborhoodBin"]].copy()

Вычислим примерную асимметрию данных для каждого признака. Заметно асимметричные признаки прологарифмируем.

In [230]:
numeric_features = train.dtypes[train.dtypes != "object"].index

Бинарные признаки не нуждаются в масштабировании. Очистим от них численные данные.

In [231]:
# Count the distinct values of every candidate column once, then split the
# candidates: exactly two distinct values -> binary, more than two -> numeric.
distinct_counts = {name: train[name].nunique() for name in numeric_features}
binary_columns = [name for name, count in distinct_counts.items() if count == 2]
numeric_features = [name for name, count in distinct_counts.items() if count > 2]

In [232]:
len(binary_columns) + len(numeric_features) == train.shape[1]

True

In [233]:
skewed = train[numeric_features].apply(lambda x: skew(x.dropna().astype(float)))

In [234]:
skewed = skewed[abs(skewed) > 0.75]
skewed = skewed.index

In [235]:
train[skewed] = np.log1p(train[skewed])
test[skewed] = np.log1p(test[skewed])

Теперь, после того как мы сгладили асимметрию в распределении, применим StandardScaler

In [236]:
# Fit the scaler on the training frame only, then apply the same fitted
# transform to the training frame and to the test frame, column by column.
scaler = StandardScaler()
scaler.fit(train[numeric_features])

for frame in (train, test):
    scaled = scaler.transform(frame[numeric_features])
    for position, name in enumerate(numeric_features):
        frame[name] = scaled[:, position]

One-hot encoding 

In [237]:
def generate_one_hot(df):
    """Build the one-hot encoded feature frame for one raw housing frame.

    Every categorical column listed below is expanded through ``onehot`` into
    indicator columns named ``"_<column>_<value>"``. ``GarageYrBlt``,
    ``YearBuilt`` and ``YearRemodAdd`` are first binned into 20-year periods
    covering 1871-2010 and then encoded the same way.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame holding the housing columns referenced below.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``df`` that contains only indicator columns. The set
        of columns depends on the values present in ``df``.
    """
    onehot_df = pd.DataFrame(index=df.index)

    # Each entry: (column, value used for missing entries, drop_name argument).
    first_group = (
        ("MSSubClass", None, "40"),
        ("MSZoning", "RL", "RH"),
        ("LotConfig", None, "FR3"),
        ("Neighborhood", None, "OldTown"),
        ("Condition1", None, "RRNe"),
        ("BldgType", None, "2fmCon"),
        ("HouseStyle", None, "1.5Unf"),
        ("RoofStyle", None, "Shed"),
        ("Exterior1st", "VinylSd", "CBlock"),
        ("Exterior2nd", "VinylSd", "CBlock"),
        ("Foundation", None, "Wood"),
        ("SaleType", "WD", "Oth"),
        ("SaleCondition", "Normal", "AdjLand"),
    )
    second_group = (
        ("LotShape", None, "IR3"),
        ("LandContour", None, "Low"),
        ("LandSlope", None, "Sev"),
        ("Electrical", "SBrkr", "FuseP"),
        ("GarageType", "None", "CarPort"),
        ("PavedDrive", None, "P"),
        ("MiscFeature", "None", "Othr"),
        ("Street", None, "Grvl"),
        ("Alley", "None", "Grvl"),
        ("Condition2", None, "PosA"),
        ("RoofMatl", None, "WdShake"),
        ("Heating", None, "Wall"),
        ("ExterQual", "None", "Ex"),
        ("ExterCond", "None", "Ex"),
        ("BsmtQual", "None", "Ex"),
        ("BsmtCond", "None", "Ex"),
        ("HeatingQC", "None", "Ex"),
        ("KitchenQual", "TA", "Ex"),
        ("FireplaceQu", "None", "Ex"),
        ("GarageQual", "None", "Ex"),
        ("GarageCond", "None", "Ex"),
        ("PoolQC", "None", "Ex"),
        ("BsmtExposure", "None", "Gd"),
        ("BsmtFinType1", "None", "GLQ"),
        ("BsmtFinType2", "None", "GLQ"),
        ("Functional", "Typ", "Typ"),
        ("GarageFinish", "None", "Fin"),
        ("Fence", "None", "MnPrv"),
        ("MoSold", None, None),
    )

    for column, fill_na, drop_name in first_group:
        onehot_df = onehot(onehot_df, df, column, fill_na, drop_name)

    # A house with a positive veneer area but no veneer type is labelled
    # "BrkFace". A missing area is treated as 0 (as generate_features does),
    # so it does not trigger the relabelling on its own.
    masonry = df[["MasVnrType", "MasVnrArea"]].copy()
    has_area = masonry["MasVnrArea"].fillna(0) != 0
    lacks_type = (masonry["MasVnrType"] == "None") | masonry["MasVnrType"].isnull()
    masonry.loc[has_area & lacks_type, "MasVnrType"] = "BrkFace"
    onehot_df = onehot(onehot_df, masonry, "MasVnrType", "None", "BrkCmn")

    for column, fill_na, drop_name in second_group:
        onehot_df = onehot(onehot_df, df, column, fill_na, drop_name)

    # Split the years 1871-2010 into seven 20-year bins "YearBin1".."YearBin7".
    year_map = pd.concat(
        [pd.Series("YearBin" + str(i + 1), index=range(1871 + i * 20, 1891 + i * 20))
         for i in range(0, 7)]
    )

    yearbin_df = pd.DataFrame(index=df.index)
    # Garage years that are missing or fall outside the bins become "NoGarage".
    yearbin_df["GarageYrBltBin"] = df.GarageYrBlt.map(year_map).fillna("NoGarage")
    yearbin_df["YearBuiltBin"] = df.YearBuilt.map(year_map)
    yearbin_df["YearRemodAddBin"] = df.YearRemodAdd.map(year_map)

    for column in ("GarageYrBltBin", "YearBuiltBin", "YearRemodAddBin"):
        onehot_df = onehot(onehot_df, yearbin_df, column, None, None)

    return onehot_df

In [238]:
train_one_hot = generate_one_hot(train_df)
test_one_hot = generate_one_hot(test_df)

In [239]:
train_one_hot.shape

(1456, 306)

In [240]:
test_one_hot.shape

(1459, 291)

Значения некоторых признаков присутствуют только в тренировочной выборке

In [241]:
drop_cols = [c for c in train_one_hot.columns if c not in test_one_hot.columns]

In [242]:
drop_cols

['_HouseStyle_2.5Fin',
 '_Exterior1st_ImStucc',
 '_Exterior1st_Stone',
 '_Exterior2nd_Other',
 '_Electrical_Mix',
 '_MiscFeature_TenC',
 '_Condition2_RRAe',
 '_Condition2_RRAn',
 '_Condition2_RRNn',
 '_RoofMatl_Membran',
 '_RoofMatl_Metal',
 '_RoofMatl_Roll',
 '_Heating_Floor',
 '_Heating_OthW',
 '_GarageQual_Ex',
 '_PoolQC_Fa']

In [243]:
train_one_hot.drop(drop_cols, axis=1, inplace=True)

Проверим себя

In [244]:
[c for c in test_one_hot.columns if c not in train_one_hot.columns]

['_MSSubClass_150']

In [245]:
# Drop every indicator column that exists only in the test frame. The list is
# computed rather than hard-coded, so the cell also works for other data and
# can be re-run without raising a KeyError.
test_only_cols = [c for c in test_one_hot.columns if c not in train_one_hot.columns]
test_one_hot = test_one_hot.drop(columns=test_only_cols)

In [246]:
train_one_hot.shape

(1456, 290)

In [247]:
test_one_hot.shape

(1459, 290)

In [248]:
train_one_hot \
.sum() \
.sort_values()[:20]

_Exterior2nd_CBlock      1
_Condition2_PosN         1
_Functional_Sev          1
_Condition2_PosA         1
_ExterCond_Po            1
_HeatingQC_Po            1
_Exterior1st_AsphShn     1
_PoolQC_Ex               1
_Exterior1st_CBlock      1
_Exterior1st_BrkComm     2
_Condition1_RRNe         2
_Condition2_Artery       2
_GarageCond_Ex           2
_MiscFeature_Othr        2
_Neighborhood_Blueste    2
_PoolQC_Gd               2
_RoofStyle_Shed          2
_SaleType_Con            2
_BsmtCond_Po             2
_MiscFeature_Gar2        2
dtype: int64

Отфильтруем признаки с низкой информативностью

In [249]:
# Per-indicator totals over the training frame, in ascending order;
# indicators whose total is below 9 are selected for removal.
indicator_totals = train_one_hot.sum().sort_values()
mask = indicator_totals < 9
drop_cols = indicator_totals[mask].index

In [250]:
train_one_hot.drop(drop_cols, axis = 1, inplace = True)
test_one_hot.drop(drop_cols, axis = 1, inplace = True)

In [251]:
(test_one_hot.shape[0]-test_one_hot.sum()).sort_values()

_PoolQC_None               3
_Street_Pave               6
_Heating_GasA             13
_Condition2_Norm          15
_RoofMatl_CompShg         17
                        ... 
_Exterior2nd_ImStucc    1454
_HouseStyle_1.5Unf      1454
_Functional_Maj1        1454
_LandSlope_Sev          1456
_Fence_MnWw             1458
Length: 242, dtype: int64

In [252]:
train = train.join(train_one_hot)
test = test.join(test_one_hot)

In [253]:
# One-column frame with the natural logarithm of SalePrice, indexed like train_df.
label_log = np.log(train_df[["SalePrice"]])

In [254]:
# One-column frame with the untransformed SalePrice, indexed like train_df.
label = train_df[["SalePrice"]].copy()

In [255]:
# Write the feature frames and the untransformed target to parquet files.
# NOTE(review): label_log is computed above but is not written here - confirm
# that only the raw SalePrice is meant to be persisted.
train.to_parquet('data/train.parquet')
test.to_parquet('data/test.parquet')
label.to_parquet('data/label.parquet')

In [256]:
train.shape

(1456, 337)