In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import shap

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import StackingRegressor
from scipy import stats
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
import time
from sklearn.feature_selection import RFE



from xgboost import XGBRegressor

In [None]:
# Read the raw competition files. `train` / `test` are kept as loaded;
# the *_df copies are the working frames that later cells transform.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train_df, test_df = train.copy(), test.copy()

# Row-wise stack of both raw files (train rows first) with a fresh 0..n-1 index.
all_data = pd.concat([train.copy(), test.copy()], ignore_index=True)

# Feature Engineering

### Manual Features

In [None]:
# New 0/1 indicator column -> source column; the indicator is 1 where the
# source value is greater than 0.
binary_flags = {
    'Isgarage': 'GarageArea',
    'Isfireplace': 'Fireplaces',
    'Ispool': 'PoolArea',
    'Issecondfloor': '2ndFlrSF',
    'IsOpenPorch': 'OpenPorchSF',
    'IsWoodDeck': 'WoodDeckSF'
}

# Label -> rank scale used to turn ExterQual into a number.
qual_map = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}


def add_manual_features(df):
    """Return a new frame with the hand-crafted features appended.

    The same recipe is applied to the train and the test frame so both end up
    with identical engineered columns, in identical order. ``df`` itself is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw housing frame; must still contain "Street", "Utilities" and "Id"
        as well as every source column referenced below.

    Returns
    -------
    pandas.DataFrame
        ``df`` without "Street", "Utilities" and "Id", plus the new columns.
    """
    out = df.drop(["Street", "Utilities", "Id"], axis=1)

    # Disabled experiment: neighbourhood-median fill for LotFrontage.
    # out['LotFrontage'] = out.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

    # Above-grade living area plus total basement area.
    out["TotalSqrtFeet"] = out["GrLivArea"] + out["TotalBsmtSF"]

    # Bathroom count, half baths weighted 0.5.
    out["TotalBaths"] = (
        out["BsmtFullBath"] + (out["BsmtHalfBath"] * .5)
        + out["FullBath"] + (out["HalfBath"] * .5)
    )

    # Presence indicators (garage, fireplace, pool, ...).
    for new_col, base_col in binary_flags.items():
        out[new_col] = (out[base_col] > 0).astype(int)

    # 1. Age of house, years since remodel, and whether it was remodeled
    out["HouseAge"] = out["YrSold"] - out["YearBuilt"]
    out["SinceRemodel"] = out["YrSold"] - out["YearRemodAdd"]
    out["IsRemodeled"] = (out["YearBuilt"] != out["YearRemodAdd"]).astype(int)

    # 2. Total porch area
    out["TotalPorchSF"] = (
        out["OpenPorchSF"] + out["EnclosedPorch"] +
        out["3SsnPorch"] + out["ScreenPorch"]
    )

    # 3. Bed/Bath ratio. A zero bath count would produce +/-inf, which the
    # downstream SimpleImputer rejects; turn it into NaN so it gets imputed.
    out["BedBathRatio"] = out["BedroomAbvGr"] / out["TotalBaths"].replace(0, np.nan)

    # 4. Disabled experiment: garage score = area * car capacity
    # out["GarageScore"] = out["GarageArea"] * out["GarageCars"]

    # 5. Total finished basement
    out["TotalFinishedBsmt"] = out["BsmtFinSF1"] + out["BsmtFinSF2"]

    # 6. Has basement
    out["HasBasement"] = (out["TotalBsmtSF"] > 0).astype(int)

    # 7. Has masonry veneer
    out["HasMasonry"] = (out["MasVnrArea"] > 0).astype(int)

    # 8. Garage holds more than one car
    out["BigGarage"] = (out["GarageCars"] > 1).astype(int)

    # 9. Year and month sold as categorical (string) columns
    out["YrSold_cat"] = out["YrSold"].astype(str)
    out["MoSold_cat"] = out["MoSold"].astype(str)

    # 10. Ratio of finished basement to total basement; a zero total is
    # replaced by 1 so the ratio is 0 instead of a division by zero.
    out["BsmtFinRatio"] = out["TotalFinishedBsmt"] / out["TotalBsmtSF"].replace(0, 1)

    # 11. Ratio of living area to lot size
    out["LivingLotRatio"] = out["GrLivArea"] / out["LotArea"]

    # 12. Quality score: OverallQual multiplied by the numeric ExterQual
    out["ExterQual_num"] = out["ExterQual"].map(qual_map)
    out["QualityScore"] = out["OverallQual"].astype(int) * out["ExterQual_num"]

    # 13. Disabled experiment: price per square foot. It needs SalePrice, so
    # it cannot be built for the test set.
    # out["PricePerSqFt"] = out["SalePrice"] / out["TotalSqrtFeet"]

    # 14. Total rooms per living area
    out["RoomsPerArea"] = out["TotRmsAbvGrd"] / out["GrLivArea"]

    # 15. Simplified building type: 1Fam vs all others
    out["IsSingleFam"] = (out["BldgType"] == "1Fam").astype(int)

    return out


train_df = add_manual_features(train_df)
test_df = add_manual_features(test_df)

### Feature Transformation

In [None]:
# Numeric predictors (every int64/float64 column except the target) and the
# string-typed categoricals, as they stand after manual feature engineering.
numerical = train_df.select_dtypes(include=['int64', 'float64']).columns.drop('SalePrice')
categorical = train_df.select_dtypes(include=['object']).columns

# Model the target on a log scale. Reading from the raw `train` frame means
# re-running this cell does not apply the log twice.
train_df['SalePrice'] = np.log1p(train['SalePrice'])

# Mean-impute the numeric columns. The imputer is fitted on the training frame
# only: `all_data` was built from the raw files and does not contain the
# engineered columns listed in `numerical`, and fitting on train keeps the
# statistics consistent with the scaler below.
num_imputer = SimpleImputer(strategy='mean')
num_imputer.fit(train_df[numerical])
train_df[numerical] = num_imputer.transform(train_df[numerical])
test_df[numerical] = num_imputer.transform(test_df[numerical])

# Standardise using training-set statistics and apply the same transform to test.
scaler = StandardScaler()
train_df[numerical] = scaler.fit_transform(train_df[numerical])
test_df[numerical] = scaler.transform(test_df[numerical])

### Automated Features

In [None]:
def safe_log(x):
    """Return ``log1p`` of ``x`` after raising every negative entry to 0.

    Flooring at 0 keeps all inputs inside the domain of ``log1p``.
    ``x`` must provide ``.clip(lower=...)`` (e.g. a pandas Series).
    """
    floored = x.clip(lower=0)
    return np.log1p(floored)


def safe_sqrt(x):
    """Return the square root of ``x`` after raising every negative entry to 0.

    ``x`` must provide ``.clip(lower=...)`` (e.g. a pandas Series).
    """
    floored = x.clip(lower=0)
    return np.sqrt(floored)

In [None]:
# Creates new arithmetic features based off of numeric columns
def make_combo_features(list_of_df, features_completed):
  """Add arithmetic combinations of the numeric columns to every frame.

  For each frame and each column named in the module-level ``numerical``
  index, the sum, difference, product and quotient (denominator offset by
  0.1) with the other numeric columns are appended, followed by the
  ``log1p``, square and square-root of the column itself. Infinite results
  are replaced by NaN.

  Relies on the module-level names ``numerical``, ``safe_log`` and
  ``safe_sqrt``.

  Parameters
  ----------
  list_of_df : list of pandas.DataFrame
      Frames to extend; each must contain every column in ``numerical``.
  features_completed : list
      Feature names already handled by an earlier call. Not modified.

  Returns
  -------
  (list of pandas.DataFrame, list)
      One extended frame per input frame, in input order, and the
      de-duplicated list of completed feature names.
  """
  all_completed = list(features_completed)
  list_of_return_df = []

  def make_combos_for_feature(main_feature, df, features, fc):
    """Return ``df`` with the features derived from ``main_feature`` appended.

    ``fc`` is the per-frame list of finished features; ``main_feature`` is
    appended to it once its features have been built.
    """
    new_features = {}
    # Once anything has been finished, features that are not finished yet
    # still have to be combined with the finished ones.
    if fc:
      needed_doubles = [feature for feature in features if feature not in fc]
    else:
      needed_doubles = []

    for feature in features:
      if feature == main_feature:
        continue
      if feature not in fc or main_feature in needed_doubles:
        new_features[f"{main_feature} + {feature}"] = df[main_feature] + df[feature]
        new_features[f"{main_feature} - {feature}"] = df[main_feature] - df[feature]
        new_features[f"{main_feature} * {feature}"] = df[main_feature] * df[feature]
        new_features[f"{main_feature} / {feature}"] = df[main_feature] / (df[feature] + 0.1)
        print(f'Feature Made: {main_feature} + {feature}')
        print(f'Feature Made: {main_feature} - {feature}')
        print(f'Feature Made: {main_feature} * {feature}')
        print(f'Feature Made: {main_feature} / {feature}')

    new_features[f"log1p {main_feature}"] = safe_log(df[main_feature])
    new_features[f"square {main_feature}"] = df[main_feature] ** 2
    new_features[f"sqrt {main_feature}"] = safe_sqrt(df[main_feature])
    print(f"Feature Made: log1p {main_feature}")
    print(f'Feature Made: square {main_feature}')
    print(f'Feature Made: sqrt {main_feature}')
    print('')
    fc.append(main_feature)
    new_df = pd.concat([df, pd.DataFrame(new_features, index=df.index)], axis=1)
    return new_df.replace([np.inf, -np.inf], np.nan)

  for df in list_of_df:
    start_time = time.time()
    return_df = df.copy()
    features = list(df[numerical].columns)
    # Every frame starts from the caller's completed list. Sharing one list
    # across frames would make later frames skip the pairwise features the
    # first frame received, leaving the frames with different columns.
    fc = list(features_completed)

    for feature in features:
      print('Working on feature: ', feature)
      print('Time: ', time.time() - start_time)
      print('')
      return_df = make_combos_for_feature(feature, return_df, features, fc)

    # Exactly one finished frame per input frame, so callers can unpack the
    # result positionally.
    list_of_return_df.append(return_df)
    all_completed.extend(fc)

  return list_of_return_df, list(set(all_completed))

# df_list = make_combo_features([train_df, test_df])
# train_df, test_df = df_list[0], df_list[1]

In [None]:
def normalize_pair(pair):
    """Return the two items of ``pair`` as a tuple in ascending order.

    This gives an order-independent key: ``(a, b)`` and ``(b, a)`` normalize
    to the same tuple.
    """
    first, second = pair
    return tuple(sorted((first, second)))

In [None]:
import time
import sys

def loading_bar(i, total):
  """Write a 50-character text progress bar for step ``i`` of ``total``.

  The bar is written to stdout after a carriage return and followed by a
  newline, so each call ends on its own line.

  Parameters
  ----------
  i : int or float
      Number of steps finished so far.
  total : int or float
      Total number of steps. A non-positive total is shown as 100%
      (nothing left to do) instead of raising ZeroDivisionError.
  """
  if total > 0:
    # Clamp so the bar never exceeds its 50-character width.
    percent = min(max((i / total) * 100, 0.0), 100.0)
  else:
    percent = 100.0
  bar = ('#' * int(percent // 2)).ljust(50)
  sys.stdout.write(f'\rLoading: [{bar}] {percent:.1f}%')
  sys.stdout.flush()
  # No artificial sleep here: this runs once per feature pair, and a fixed
  # delay per call adds up to minutes of idle time over tens of thousands of steps.
  print()



In [None]:
def combo(list_of_df, fp, verbose=True):
  """Add pairwise arithmetic and unary transform features to every frame.

  For each frame, every unordered pair of int64/float64 columns (excluding
  ``SalePrice``) that is not already listed in ``fp`` contributes a sum,
  difference, product and quotient column (denominator offset by 0.1). Each
  numeric column also contributes ``log1p``, square and square-root
  columns. A feature whose column name already exists is not created again,
  and infinite values in the result are replaced by NaN.

  Relies on the module-level helpers ``normalize_pair``, ``loading_bar``,
  ``safe_log`` and ``safe_sqrt``.

  Parameters
  ----------
  list_of_df : list of pandas.DataFrame
      Frames to extend. Every frame is processed against the same ``fp``,
      so frames with the same numeric columns receive the same new columns.
  fp : list of 2-tuples
      Normalized feature pairs finished by earlier calls; they are skipped.
      The list is not modified.
  verbose : bool, default True
      Print the per-pair progress log and progress bar.

  Returns
  -------
  (list of pandas.DataFrame, list of 2-tuples)
      One extended frame per input frame, in input order, and ``fp``
      followed by the pairs processed during this call.
  """
  log = print if verbose else (lambda *args, **kwargs: None)

  def numeric_features(df):
    """Names of the int64/float64 columns of ``df``, without the target."""
    cols = df.select_dtypes(include=['int64', 'float64']).columns
    return cols.drop('SalePrice') if 'SalePrice' in cols else cols

  features_per_df = [numeric_features(df) for df in list_of_df]
  # One progress step per ordered (main_feature, feature) combination that the
  # loops below actually visit, so the bar reaches 100% on the last step.
  loading_total = sum(len(features) ** 2 for features in features_per_df)
  loading_i = 0

  prior_pairs = [tuple(pair) for pair in fp]
  feature_pairs = list(prior_pairs)   # returned: prior pairs + new ones, in order
  known_pairs = set(prior_pairs)      # fast membership test for feature_pairs
  start_time = time.time()

  def extend_frame(df, features):
    """Return ``df`` with the new feature columns appended."""
    nonlocal loading_i
    seen = set(prior_pairs)   # per-frame, so every frame gets the same pairs
    taken = set(df.columns)   # guards against duplicate column labels
    new_features = {}

    def add_feature(name, values):
      """Store ``values`` under ``name`` unless that column name is taken."""
      if name in taken:
        return
      taken.add(name)
      new_features[name] = values
      log(f'Feature Made: {name}')

    for main_feature in features:
      for feature in features:
        loading_i += 1
        if verbose:
          print(loading_i, ' / ', loading_total)
          loading_bar(loading_i, loading_total)
          print(f'Working on feature {feature} of {main_feature}')
          print('Time: ', time.time() - start_time)
          print('')
        if feature == main_feature:
          log('SKIP')
          continue
        pair = normalize_pair((main_feature, feature))
        if pair in seen:
          log('SKIP')
          continue
        seen.add(pair)
        if pair not in known_pairs:
          known_pairs.add(pair)
          feature_pairs.append(pair)
        add_feature(f"{main_feature} + {feature}", df.loc[:, main_feature] + df.loc[:, feature])
        add_feature(f"{main_feature} - {feature}", df.loc[:, main_feature] - df.loc[:, feature])
        add_feature(f"{main_feature} * {feature}", df.loc[:, main_feature] * df.loc[:, feature])
        add_feature(f"{main_feature} / {feature}", df.loc[:, main_feature] / (df.loc[:, feature] + 0.1))
        log('')
      add_feature(f"log1p {main_feature}", safe_log(df.loc[:, main_feature]))
      add_feature(f"square {main_feature}", df.loc[:, main_feature] ** 2)
      add_feature(f"sqrt {main_feature}", safe_sqrt(df.loc[:, main_feature]))
      log('')

    # Attach all new columns in one concat instead of inserting one by one.
    new_df = pd.concat([df, pd.DataFrame(new_features, index=df.index)], axis=1)
    return new_df.replace([np.inf, -np.inf], np.nan)

  list_of_return_df = [
      extend_frame(df, features)
      for df, features in zip(list_of_df, features_per_df)
  ]

  return list_of_return_df, feature_pairs

In [None]:
def make_combo_features_iterations(list_of_df, i=2):
  """Run ``combo`` over the frames ``i`` times in a row.

  Each pass receives the frames produced by the previous pass together with
  the feature pairs that pass reported as completed.

  Parameters
  ----------
  list_of_df : list of pandas.DataFrame
      Frames to extend.
  i : int, default 2
      Number of passes. With ``i=0`` the input list is returned unchanged.

  Returns
  -------
  list of pandas.DataFrame
      The frames after the last pass, in input order.
  """
  completed_features = []
  # Started once, so the printed time is the elapsed time since the first
  # pass began rather than a value that is reset right before printing.
  start_time = time.time()
  for iteration in range(i):
    print('Starting iteration: ', iteration)
    print('Time: ', time.time() - start_time)
    print('')
    list_of_df, completed_features = combo(list_of_df, completed_features)

  return list_of_df

# Single pass of automated feature generation. The returned list keeps the
# input order: train first, test second.
combo_inputs = [train_df, test_df]
df_list = make_combo_features_iterations(combo_inputs, 1)
train_df = df_list[0]
test_df = df_list[1]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
6193  /  42025
Loading: [#######                                           ] 14.7%
Working on feature QualityScore of SinceRemodel
Time:  320.64791560173035

Feature Made: SinceRemodel + QualityScore
Feature Made: SinceRemodel - QualityScore
Feature Made: SinceRemodel * QualityScore
Feature Made: SinceRemodel / QualityScore

6194  /  42025
Loading: [#######                                           ] 14.7%
Working on feature RoomsPerArea of SinceRemodel
Time:  320.70020151138306

Feature Made: SinceRemodel + RoomsPerArea
Feature Made: SinceRemodel - RoomsPerArea
Feature Made: SinceRemodel * RoomsPerArea
Feature Made: SinceRemodel / RoomsPerArea

6195  /  42025
Loading: [#######                                           ] 14.7%
Working on feature IsSingleFam of SinceRemodel
Time:  320.75251817703247

Feature Made: SinceRemodel + IsSingleFam
Feature Made: SinceRemodel - IsSingleFam
Feature Made: SinceRemodel * IsSingleFam
F

### Missing Values

In [None]:
# Categorical columns whose missing values are replaced by the literal
# label 'None' (presumably NaN means the house lacks that feature; confirm
# against the data description).
fill_none_cols = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                  'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                  'PoolQC', 'Fence', 'MiscFeature', 'MasVnrType']
for col in fill_none_cols:
    train_df[col] = train_df[col].fillna('None')
    test_df[col] = test_df[col].fillna('None')

# NOTE(review): the Feature Transformation cell already mean-imputes every
# numeric column, so these zero-fills look like no-ops at this point - confirm
# whether they were meant to run before imputation.
train_df['MasVnrArea'] = train_df['MasVnrArea'].fillna(0)
test_df['MasVnrArea'] = test_df['MasVnrArea'].fillna(0)
train_df['GarageYrBlt'] = train_df['GarageYrBlt'].fillna(0)
test_df['GarageYrBlt'] = test_df['GarageYrBlt'].fillna(0)

# Remaining categoricals: fill with the most frequent training value. The
# training mode is used for the test frame too, so both frames are imputed
# with the same statistic (as the scaler does for numeric columns).
cat_mode_cols = ['MSZoning', 'Electrical', 'KitchenQual', 'SaleType',
                 'Exterior1st', 'Exterior2nd', 'Functional']
for col in cat_mode_cols:
    train_mode = train_df[col].mode()[0]
    train_df[col] = train_df[col].fillna(train_mode)
    test_df[col] = test_df[col].fillna(train_mode)

## Encoding

### Ordinal Encoding

In [None]:
# Shared label -> rank scales. Each column below gets its own dict copy.
_quality = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
_quality_or_none = {"None": 0, **_quality}

# Column -> {label: rank}. Labels missing from a column's dict become NaN
# when the mapping is applied.
ordinal_maps = {
    "ExterQual": dict(_quality),
    "ExterCond": dict(_quality),
    "BsmtQual": dict(_quality_or_none),
    "BsmtCond": dict(_quality_or_none),
    "BsmtExposure": {"None": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4},
    "KitchenQual": dict(_quality),
    "HeatingQC": dict(_quality),
    "FireplaceQu": dict(_quality_or_none),
    "GarageQual": dict(_quality_or_none),
    "GarageCond": dict(_quality_or_none),
    "PoolQC": {"None": 0, "Fa": 1, "TA": 2, "Gd": 3, "Ex": 4},
    "Fence": {"None": 0, "MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4},
    "Functional": dict(zip(
        ["Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],
        range(1, 9),
    )),
}

# Replace the labels by their ranks in both working frames.
for frame in (train_df, test_df):
    for col, mapping in ordinal_maps.items():
        frame[col] = frame[col].map(mapping)

### Target Encoding

In [None]:
target_encode_cols = ['Neighborhood', 'Exterior1st', 'Exterior2nd', 'Condition1', 'Condition2',
                      'SaleType', 'HouseStyle', 'RoofMatl']

# Out-of-fold target mean encoding: each training row is encoded with category
# means computed on the other folds, so its own target never feeds its feature.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Fallback for categories that are absent from the fitting rows.
global_mean = train_df['SalePrice'].mean()

for col in target_encode_cols:
    te_col = col + "_te"
    # Start with a float column: initialising with the integer 0 and then
    # writing float means triggers pandas' incompatible-dtype FutureWarning.
    train_df[te_col] = np.nan

    for train_index, val_index in kf.split(train_df):
        train_fold, val_fold = train_df.iloc[train_index], train_df.iloc[val_index]
        means = train_fold.groupby(col)['SalePrice'].mean()
        train_df.loc[train_df.index[val_index], te_col] = val_fold[col].map(means).fillna(global_mean)

    # Map means from full training set to test
    means = train_df.groupby(col)['SalePrice'].mean()
    test_df[te_col] = test_df[col].map(means).fillna(global_mean)

# Drop original target-encoded columns
train_df = train_df.drop(columns=target_encode_cols)
test_df = test_df.drop(columns=target_encode_cols)

 11.87795354 11.80939876 11.66466055 12.29793046 12.56611131 12.17908471
 11.73446679 12.60754909 12.17908471 11.87795354 11.87795354 11.80939876
 11.94362079 11.87795354 12.05510203 11.73446679 12.17440154 12.29793046
 12.05510203 11.87795354 12.17908471 11.73446679 12.17440154 11.71207433
 11.66466055 12.17908471 12.29793046 11.73446679 12.17908471 12.05510203
 12.2090336  12.17908471 12.6429962  12.17908471 12.05510203 11.71207433
 12.29793046 12.05510203 11.87795354 11.73446679 12.17908471 12.29793046
 12.28918217 11.94362079 11.66466055 12.17440154 11.66466055 12.29793046
 12.05510203 12.60754909 12.17440154 12.13242409 12.05510203 11.87795354
 12.60754909 11.87795354 11.50581851 12.60754909 12.28918217 11.73446679
 11.85096839 12.28918217 11.66466055 11.51389107 11.52718564 11.87795354
 12.28918217 12.17908471 12.17908471 11.73446679 11.52718564 12.17440154
 11.73446679 12.17440154 11.80939876 11.94362079 12.6429962  11.73446679
 12.05510203 11.87795354 12.17908471 12.33881505 11

### One Hot Encoding

In [None]:
# ----- 3. One-Hot Encoding -----
# Select categorical columns that are not target or ordinal encoded
excluded_cols = list(ordinal_maps.keys()) + target_encode_cols
remaining_cat_cols = [col for col in train_df.select_dtypes(include='object').columns if col not in excluded_cols]

# One-hot encode these columns. The encoder is fitted on train only;
# categories that appear only in test are encoded as all zeros.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
train_encoded = ohe.fit_transform(train_df[remaining_cat_cols])

# Name the indicator columns "<column>_<category>" instead of the default
# integer labels 0..n-1. Integer labels lose the meaning of each column and
# leave the frame with mixed int/str column names.
ohe_columns = ohe.get_feature_names_out(remaining_cat_cols)
train_ohe = pd.DataFrame(train_encoded, columns=ohe_columns, index=train_df.index)
test_ohe = pd.DataFrame(ohe.transform(test_df[remaining_cat_cols]), columns=ohe_columns, index=test_df.index)

# Add one-hot encoded columns and drop original
train_df = train_df.drop(columns=remaining_cat_cols).join(train_ohe)
test_df = test_df.drop(columns=remaining_cat_cols).join(test_ohe)

In [None]:
# def remove_outliers(df, threshold=3, columns=None):
#   d = df.copy()
#   z = np.abs(stats.zscore(d[columns]))
#   outliers_idx = []
#   for index, row in z.iterrows():
#     if (row > threshold).any():
#       outliers_idx.append(index)
#   print(f'Number of removed rows = {len(outliers_idx)}')
#   return d.drop(outliers_idx), d.loc[outliers_idx]

# train_df, outliers = remove_outliers(train_df, columns=numerical)

In [None]:
# Write both processed frames to CSV, without the index column.
for frame, csv_path in ((train_df, "good_train_data.csv"), (test_df, "good_test_data.csv")):
    frame.to_csv(csv_path, index=False)

In [None]:
# Display the processed test frame as the cell output.
test_df

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,104,105,106,107,108,109,110,111,112,113
0,-0.872563,0.451936,0.110763,-0.795151,0.381743,-0.340077,-1.156380,-0.574410,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.872563,0.497357,0.375850,-0.071836,0.381743,-0.439440,-1.301740,0.023903,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.073375,0.179413,0.332053,-0.795151,-0.517200,0.852269,0.636400,-0.574410,3,3,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.073375,0.361095,-0.054002,-0.071836,0.381743,0.885390,0.636400,-0.463612,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.492282,-1.228623,-0.552407,1.374795,-0.517200,0.686666,0.345679,-0.574410,4,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.438219,-2.227875,-0.859988,-1.518467,1.280685,-0.041991,-0.720298,-0.574410,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1455,2.438219,-2.227875,-0.864197,-1.518467,-0.517200,-0.041991,-0.720298,-0.574410,3,3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1456,-0.872563,4.085578,0.950423,-0.795151,1.280685,-0.373198,0.539493,-0.574410,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1457,0.664586,-0.365633,-0.007600,-0.795151,-0.517200,0.686666,0.345679,-0.574410,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Display the processed training frame as the cell output.
train_df

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,104,105,106,107,108,109,110,111,112,113
0,0.073375,-0.229372,-0.207142,0.651479,-0.517200,1.050994,0.878668,0.511418,4,3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.574410,3,3,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.073375,-0.093110,0.073480,0.651479,-0.517200,0.984752,0.830215,0.323060,4,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.309859,-0.456474,-0.096897,0.651479,-0.517200,-1.863632,-0.720298,-0.574410,3,3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.073375,0.633618,0.375148,1.374795,-0.517200,0.951632,0.733308,1.364570,4,3,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.073375,-0.365633,-0.260560,-0.071836,-0.517200,0.918511,0.733308,-0.574410,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1456,-0.872563,0.679039,0.266407,-0.071836,0.381743,0.222975,0.151865,0.084843,3,3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1457,0.309859,-0.183951,-0.147810,0.651479,3.078570,-1.002492,1.024029,-0.574410,5,4,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1458,-0.872563,-0.093110,-0.080160,-0.795151,0.381743,-0.704406,0.539493,-0.574410,3,3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
