In [28]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from scipy.stats import skew
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.ensemble import IsolationForest
from typing import List, Dict

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas chained-assignment and
# library deprecation warnings raised by the cells below; consider narrowing
# it to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

In [29]:
# Toggle for the diagnostic figures drawn in this cell and in cap_outliers().
PLOT_FIGURES = False

# Both CSV files are read from the current working directory.
data_dir = os.getcwd()
df_train = pd.read_csv(os.path.join(data_dir, "train.csv"))
df_test = pd.read_csv(os.path.join(data_dir, "test.csv"))

# Report the shape of the raw target distribution before transforming it.
y = df_train["SalePrice"]
print(f"skew: {y.skew():.3f}, kurt: {y.kurt():.3f}")

if PLOT_FIGURES:
  # Left panel: raw target.
  fig = plt.figure()
  fig.set_figwidth(10)
  fig.set_figheight(1)
  fig.add_subplot(1, 2, 1)
  sns.histplot(y, kde=True, linewidth=0)

# From here on the target is log1p(SalePrice).
y = np.log1p(df_train["SalePrice"])

if PLOT_FIGURES:
  # Right panel: log-transformed target.
  fig.add_subplot(1, 2, 2)
  sns.histplot(y, kde=True, linewidth=0)
  plt.show()

# Stack the train and test features (train rows first) so the preprocessing
# below is applied to both at once.
features_train = df_train.drop(columns=["SalePrice", "Id"])
features_test = df_test.drop(columns=["Id"])
df_result = pd.concat([features_train, features_test]).reset_index(drop=True)

skew: 1.883, kurt: 6.536


In [30]:
def drop_na_columns(df: pd.DataFrame, threshold: float = 10.0) -> pd.DataFrame:
  """Return ``df`` without the columns that have too many missing values.

  A column is dropped when its share of missing values, expressed as a
  percentage and rounded to two decimals, is at least ``threshold``.

  Args:
    df: Frame to filter. It is not modified.
    threshold: Percentage (0-100) of missing values from which a column is
      dropped. Defaults to 10.0.

  Returns:
    A new frame with the surviving columns in their original order. A frame
    without any missing values is returned with all of its columns.
  """
  # Percentage of missing cells per column, rounded before the comparison.
  na_percentage = (df.isna().sum() / len(df) * 100.0).round(2)
  sparse_cols = na_percentage[na_percentage >= threshold].index
  # No assumption is made about which NaNs remain afterwards: input that has
  # no missing numeric values left is valid and must not raise.
  return df.drop(columns=sparse_cols)

# Remove the columns with too many missing values from the combined
# train+test feature frame (rebinds df_result to the filtered frame).
df_result = drop_na_columns(df_result)

In [31]:
def fill_na(df: pd.DataFrame) -> pd.DataFrame:
  """Impute every missing value in ``df`` and return the completed frame.

  Numeric columns are imputed jointly with a 5-nearest-neighbour imputer
  (uniform weights). Any column that still has missing values afterwards is
  filled with its most frequent value. The input frame is not modified.

  Args:
    df: Frame with numeric and/or non-numeric columns.

  Returns:
    A new frame with the numeric columns first, followed by the non-numeric
    columns, containing no missing values. Numeric columns hold the imputer
    output and are therefore floating point.

  Raises:
    ValueError: If a column with missing values has no observed value from
      which a mode could be taken.
    AssertionError: If missing values remain after imputation.
  """
  df_number = df.select_dtypes(include=np.number)
  df_other = df.select_dtypes(exclude=np.number)

  # Skip the imputer when there is nothing numeric to impute.
  if df_number.shape[1] > 0:
    imputer = KNNImputer(n_neighbors=5, weights="uniform", missing_values=np.nan)
    # Build a fresh frame from the imputer output instead of slice-assigning
    # into the select_dtypes() result; this avoids chained-assignment
    # semantics and keeps the original index and column labels explicitly.
    df_number = pd.DataFrame(
      imputer.fit_transform(df_number),
      columns=df_number.columns,
      index=df_number.index,
    )

  df_result = pd.concat([df_number, df_other], axis=1)

  # Whatever is still missing gets the column's mode.
  for col in df_result.columns[df_result.isna().any()]:
    modes = df_result[col].mode()
    if modes.empty:
      raise ValueError(f"column {col!r} has no observed values to impute from")
    df_result[col] = df_result[col].fillna(modes.iloc[0])

  assert not df_result.isna().any().any()
  return df_result
# Impute the remaining missing values (rebinds df_result to the filled frame,
# whose columns are reordered numeric-first by fill_na).
df_result = fill_na(df_result)

In [32]:
def cap_outliers(df: pd.DataFrame) -> pd.DataFrame:
  """Cap isolation-forest outliers of every non-object column.

  For each non-object column an ``IsolationForest`` is fitted on that column
  alone. Rows it flags as outliers are moved to the column's 0.2 quantile
  when they lie below it, or to the 0.8 quantile when they lie above it.
  Flagged rows between the two quantiles keep their value.

  When the module-level ``PLOT_FIGURES`` flag is truthy, a before/after
  scatter plot against the module-level target ``y`` is shown per column.

  Args:
    df: Frame to process. It is not modified.

  Returns:
    A copy of ``df`` in which the processed columns hold the capped values.
  """
  # One RandomState is shared by all forests, so its state advances from
  # column to column exactly as the columns are visited.
  rng = np.random.RandomState(1)
  df = df.copy()
  cols_non_object: List[str] = df.select_dtypes(exclude="object").columns.tolist()

  for col in cols_non_object:
    if PLOT_FIGURES:
      fig = plt.figure()
      fig.set_figwidth(10)
      fig.set_figheight(3)
      # suptitle labels the figure itself; plt.title() on a figure without
      # axes would implicitly create an extra Axes behind the two panels.
      fig.suptitle(f"{col} - left: before | right: after")
      ax_before = fig.add_subplot(1, 2, 1)
      sns.scatterplot(x=df[col], y=y, ax=ax_before)

    values = df[col]
    isof = IsolationForest(max_samples=100, random_state=rng)
    # predict() returns -1 for rows the forest considers outliers.
    is_outlier = isof.fit(df[[col]]).predict(df[[col]]) == -1
    quant_min, quant_max = values.quantile([0.2, 0.8])

    too_low = is_outlier & (values < quant_min).to_numpy()
    too_high = is_outlier & (values > quant_max).to_numpy()
    # Replace the whole column in one assignment. Element-wise chained
    # assignment (df[col][row] = ...) is not guaranteed to write through to
    # the frame and relies on the index labels being 0..n-1.
    df[col] = values.mask(too_low, quant_min).mask(too_high, quant_max)

    if PLOT_FIGURES:
      ax_after = fig.add_subplot(1, 2, 2)
      sns.scatterplot(x=df[col], y=y, ax=ax_after)
      plt.show()

  return df

# Cap the isolation-forest outliers of the non-object columns at their
# 0.2 / 0.8 quantiles.
df_result = cap_outliers(df_result)

In [33]:
def fix_skewness(df: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
  """Apply ``log1p`` to the right-skewed numeric columns of ``df``.

  A numeric column is transformed when its sample skewness (``scipy.stats.skew``)
  is greater than ``threshold``. Columns containing values at or below -1 are
  left unchanged, because ``log1p`` would yield -inf or NaN for them.

  Args:
    df: Frame to process. It is not modified.
    threshold: Skewness above which a column is transformed. Defaults to 0.5.

  Returns:
    A copy of ``df`` with the selected columns replaced by their ``log1p``.
  """
  df = df.copy()
  numeric = df.select_dtypes(include=np.number)
  if numeric.shape[1] == 0:
    # Nothing numeric to inspect.
    return df

  skewness = numeric.apply(skew)
  # log1p is only finite for values strictly greater than -1.
  transformable = numeric.min() > -1
  skewed_cols = skewness.index[(skewness > threshold) & transformable].tolist()

  for col in skewed_cols:
    df[col] = np.log1p(df[col])

  return df

# Log-transform the right-skewed numeric columns.
df_result = fix_skewness(df_result)

In [34]:
# Number of cross-validation folds used when scoring models.
FOLDS = 5

def get_model_scores(X_train: pd.DataFrame, X_test: pd.DataFrame) -> pd.DataFrame:
  """Cross-validate a fixed set of default-configured regressors.

  Every model is evaluated with ``cross_val_score`` over ``FOLDS`` folds on
  ``X_train`` against the module-level target ``y``, using each estimator's
  default scorer.

  Args:
    X_train: Training features.
    X_test: Not used by this function; kept so existing calls keep working.

  Returns:
    A frame indexed by each model's string representation, with one numeric
    column ``score_mean`` holding the mean fold score rounded to three
    decimals. The scores stay numeric so that sorting on them orders by
    value and not by text.
  """
  models = [
    LinearRegression(), Lasso(), Ridge(),
    KNeighborsRegressor(), RandomForestRegressor(), DecisionTreeRegressor(),
    ElasticNet(), XGBRegressor()
  ]
  cv_scores = [
    cross_val_score(model, X_train, y, cv=FOLDS).mean()
    for model in models
  ]
  model_indices = [str(model) for model in models]

  df = pd.DataFrame(index=model_indices, data=cv_scores, columns=["score_mean"])
  return df.round(3)

# Split the combined frame back into its two parts: the first len(y) rows
# came from train.csv, the remaining rows from test.csv.
n_train = len(y)
X_train = df_result.iloc[:n_train, :]
X_test = df_result.iloc[n_train:, :]

# NOTE(review): the encoder is fitted on every column, numeric ones included,
# so each distinct numeric value becomes its own indicator column and values
# that occur only in the test rows are encoded as all zeros
# (handle_unknown="ignore"). Confirm this is intended; restricting the encoder
# to the object columns would change every score below.
ohe = OneHotEncoder(handle_unknown="ignore")
# Fit and encode the training rows in one pass.
X_train = ohe.fit_transform(X_train)
X_test = ohe.transform(X_test)

df_scores = get_model_scores(X_train, X_test)
# Sort on numeric values so the ranking is by score and not by text.
df_scores.astype({"score_mean": float}).sort_values(by="score_mean", ascending=False)

Unnamed: 0,score_mean
"XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,\n colsample_bynode=None, colsample_bytree=None,\n enable_categorical=False, gamma=None, gpu_id=None,\n importance_type=None, interaction_constraints=None,\n learning_rate=None, max_delta_step=None, max_depth=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n n_estimators=100, n_jobs=None, num_parallel_tree=None,\n predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,\n scale_pos_weight=None, subsample=None, tree_method=None,\n validate_parameters=None, verbosity=None)",0.868
Ridge(),0.86
RandomForestRegressor(),0.848
LinearRegression(),0.83
KNeighborsRegressor(),0.824
DecisionTreeRegressor(),0.739
Lasso(),-0.003
ElasticNet(),-0.003


In [35]:
# Candidate hyper-parameter grids, keyed by model label. Only the XGBRegressor
# grid is searched in this cell; the other two are defined but not used here.
hyper_params = {
  "Ridge()": {
    "alpha": [0.1, 0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0],
  },
  "RandomForestRegressor()": {
    "n_estimators": [10, 100, 200, 400, 800],
    "max_depth": [1, 2, 4, 8, 16]
  },
  "XGBRegressor()": {
    'max_depth': [3,6,10],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'colsample_bytree': [0.3, 0.7]
  }
}

gs = GridSearchCV(
  estimator=XGBRegressor(),
  # Direct lookup: a mistyped key raises KeyError here instead of handing
  # None to GridSearchCV.
  param_grid=hyper_params["XGBRegressor()"],
  # Use the same fold count as the model comparison above.
  cv=FOLDS,
)
gs.fit(X_train, y)
print(f"Estimator: {gs.best_estimator_}, Score: {gs.best_score_}")
# With the default refit=True, GridSearchCV has already refit the best
# estimator on all of X_train, so it can be used as is.
xgboost = gs.best_estimator_

Estimator: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None), Score: 0.890343261702464


In [36]:
# Only the last expression of a cell is displayed, so the training score is
# printed explicitly; the per-fold cross-validation scores follow as the
# cell's output.
print(f"Train score: {xgboost.score(X_train, y):.3f}")
cross_val_score(xgboost, X_train, y, cv=FOLDS)

array([0.90655132, 0.87274713, 0.89191008, 0.89342617, 0.88708161])

In [37]:
# Start from the sample submission file in the working directory.
submission_path = os.path.join(os.getcwd(), "sample_submission.csv")
submission = pd.read_csv(submission_path)

# The model was trained on log1p(SalePrice): invert with expm1, then round down.
log_predictions = xgboost.predict(X_test)
result = np.floor(np.expm1(log_predictions))

# Overwrite the second column (by position) with the predictions and save.
submission.iloc[:, 1] = result
submission.to_csv('submission.csv', index=False)