In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import datetime
import shutil
import seaborn as sns
from xgboost import XGBRegressor
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [7]:
# Locations of the competition CSV files, relative to the notebook.
train_dir = "../input/house-prices-advanced-regression-techniques/train.csv"
test_dir = "../input/house-prices-advanced-regression-techniques/test.csv"

# Both files are read with the "Id" column as the index.
train_data = pd.read_csv(train_dir, index_col="Id")
evaluation_data = pd.read_csv(test_dir, index_col="Id")

# Split the labelled frame into features and the "SalePrice" target.
x_train = train_data.drop(columns=["SalePrice"])
y_train = train_data.loc[:, "SalePrice"]

# 80/20 hold-out split; the fixed seed keeps it reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    x_train,
    y_train,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Correlation of every numeric column with the target, computed once instead
# of once per column. Selecting numeric dtypes explicitly keeps this working
# on pandas versions where DataFrame.corr() no longer skips text columns.
saleprice_corr = train_data.select_dtypes(include="number").corr()["SalePrice"]

# Columns whose correlation with SalePrice is at most 0.2 (NaN correlations
# never satisfy the comparison, so they are not listed).
# NOTE(review): the comparison is signed, so strongly negative correlations
# are listed as well — confirm whether abs() was intended.
to_drop = saleprice_corr[saleprice_corr <= 0.2].index.tolist()

In [None]:
# Parameter search for the XGBoost pipeline
# (mean-imputed numerical columns, one-hot encoded categorical columns).


def _build_search_pipeline(numerical_col, categorical_col, estimators, l_rate):
    """Return an unfitted preprocessing + XGBRegressor pipeline for one grid point.

    numerical_col / categorical_col: column names routed to each transformer.
    estimators: value passed to XGBRegressor(n_estimators=...).
    l_rate: value passed to XGBRegressor(learning_rate=...).
    """
    categorical_transformation = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("OneHot", OneHotEncoder(handle_unknown="ignore")),
    ])
    preprocess = ColumnTransformer(
        transformers=[
            ("numerical", SimpleImputer(), numerical_col),
            ("categorical", categorical_transformation, categorical_col),
        ])
    xgdb = XGBRegressor(n_estimators=estimators, random_state=0, learning_rate=l_rate)
    return Pipeline(steps=[
        ("preprocessor", preprocess),
        ("model", xgdb),
    ])


# Learning rates to try. Widen the list (for example
# [x / 1000 for x in range(1, 10)]) to search over more values.
learning_rate = [0.08]

# Maps "<learning rate> and <n_estimators>" to the hold-out mean squared error.
best = {}

# The column lists do not change between grid points, so build them once.
numerical_col = [col for col in x_train.columns
                 if x_train[col].dtype in ["int64", "float64"]]
categorical_col = [col for col in x_train.columns
                   if x_train[col].dtype in ["object"]]

for l_rate in learning_rate:
    for estimators in range(150, 300, 10):
        model = _build_search_pipeline(numerical_col, categorical_col, estimators, l_rate)

        # The shared split is used as-is and never modified here: rescaling
        # train_x / test_x / train_y / test_y in place inside the loop would
        # change the data seen by every later cell, and would score each grid
        # point against a hold-out set scaled with its own statistics.
        model.fit(train_x, train_y)
        prediction = model.predict(test_x)
        best[f"{l_rate} and {estimators}"] = mean_squared_error(y_true=test_y, y_pred=prediction)

In [None]:
# Key of the grid point with the smallest recorded error.
min(best.items(), key=lambda entry: entry[1])[0]

In [3]:
# Pipeline for XGBoost

# Rebuild the hold-out split from the untouched feature/target frames so this
# cell does not depend on whatever earlier cells left in train_x / train_y
# (same seed and proportions as the initial split).
train_x, test_x, train_y, test_y = train_test_split(
    x_train, y_train, train_size=0.8, test_size=0.2, random_state=1
)

numerical_col = [col for col in x_train.columns
                 if x_train[col].dtype in ["int64", "float64"]]

categorical_col = [col for col in x_train.columns
                   if x_train[col].dtype in ["object"]]

# Numerical gaps are filled with SimpleImputer's default strategy.
numerical_imputer = SimpleImputer()

# Categorical gaps are filled with a constant placeholder, then one-hot
# encoded; categories unseen during fit are ignored at transform time.
categorical_transformation = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant")),
    ("OneHot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("numerical", numerical_imputer, numerical_col),
        ("categorical", categorical_transformation, categorical_col)
    ])

xgdb = XGBRegressor(n_estimators=190, random_state=0, learning_rate=0.08)

model = Pipeline(steps=[
    ("preprocessor", preprocess),
    ("model", xgdb)
])

model.fit(train_x, train_y)
prediction = model.predict(test_x)

# Mean squared error on the hold-out split, shown as the cell output.
msr = mean_squared_error(y_true=test_y, y_pred=prediction)
msr

In [None]:
# Score the fitted model on the held-out split and show the value.
holdout_score = model.score(test_x, test_y)
holdout_score

In [8]:
# Data preparation for the neural network
y_training = train_data["SalePrice"]
x_training = train_data.drop("SalePrice", axis=1)

# Correlation with the target, computed once on the numeric columns only
# (DataFrame.corr() on text columns fails on recent pandas versions).
saleprice_corr = train_data.select_dtypes(include="number").corr()["SalePrice"]
to_drop = saleprice_corr[saleprice_corr <= 0.2].index.tolist()

numerical_col = [col for col in x_training.columns
                 if x_training[col].dtype in ["int64", "float64"]]

categorical_col = [col for col in x_training.columns
                   if x_training[col].dtype in ["object"]]

train_x, test_x, train_y, test_y = train_test_split(
    x_training, y_training, train_size=0.8, test_size=0.2, random_state=10
)

# Standardise with statistics of the training split only, and apply the same
# statistics to the hold-out split so both live on one scale.
feature_mean = train_x[numerical_col].mean()
# A zero standard deviation would divide by zero; leave such columns unscaled.
feature_std = train_x[numerical_col].std().replace(0, 1)
target_mean = train_y.mean()
target_std = train_y.std()

# Work on copies so the assignments below never write into a view.
train_x = train_x.copy()
test_x = test_x.copy()
train_x[numerical_col] = (train_x[numerical_col] - feature_mean) / feature_std
test_x[numerical_col] = (test_x[numerical_col] - feature_mean) / feature_std
train_y = (train_y - target_mean) / target_std
test_y = (test_y - target_mean) / target_std

# ColumnTransformer emits its output in transformer order (numerical block
# first, then categorical), not in the original column order.
columns = numerical_col + categorical_col
numerical_imputer = SimpleImputer()

categorical_transformation = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # Categories that were not present during fit are encoded as -1 instead of
    # raising when the hold-out or evaluation data is transformed.
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

preprocess = ColumnTransformer(
    transformers=[
        ("numerical", numerical_imputer, numerical_col),
        ("categorical", categorical_transformation, categorical_col)
    ])

# Fit on the training split only; the hold-out split is transformed with the
# already-fitted imputers and encoders so both share one encoding.
train_x = pd.DataFrame(preprocess.fit_transform(train_x), columns=columns, index=train_x.index)
test_x = pd.DataFrame(preprocess.transform(test_x), columns=columns, index=test_x.index)

In [11]:
# Neural Network Model
# Fully connected regressor: an input-width layer, two hidden layers and a
# single linear output unit, trained with mean squared error and Adam.
model = Sequential()
model.add(Dense(train_x.shape[1], activation="relu", name="dense_1"))
model.add(Dense(512, activation="relu", name="dense_2"))
model.add(Dense(256, activation="relu", name="dense_3"))
model.add(Dense(1, activation="linear", name="Activation"))
model.compile(loss="mean_squared_error", optimizer="adam")

# One TensorBoard run directory per execution, named after the start time.
log_dir = f"./{datetime.datetime.now().strftime('%d-%m-%y %H:%M')}"
tensorboard_log = tf.keras.callbacks.TensorBoard(log_dir, histogram_freq=1)
EPOCHS = 5
BATCH = 1
model.fit(train_x, train_y, epochs=EPOCHS, batch_size=BATCH, validation_data=(test_x, test_y), callbacks=[tensorboard_log])

# Archive only this run's log directory. Zipping "./" would sweep up every
# other file in the working directory, including the archive being written.
shutil.make_archive("tensorboard", "zip", root_dir=".", base_dir=log_dir)

In [None]:
# Predicted vs. actual values on the hold-out split.
# The predictions are recomputed from the current `model` and `test_x`, so the
# two plotted arrays always come from the same rows and the same model.
prediction = np.ravel(model.predict(test_x))

fig, ax = plt.subplots(figsize=(16, 8))
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.regplot(x=prediction, y=np.asarray(test_y), ax=ax)
ax.set(xlabel="Predicted", ylabel="Actual", title="Hold-out predictions vs. actual values");

In [4]:
# Write the evaluation-set predictions to submission.csv (columns: Id, SalePrice).
# NOTE(review): `model` has to accept the raw evaluation frame here (presumably
# the fitted XGBoost pipeline) — confirm the cell order, because another cell
# rebinds `model` to the Keras network and also writes submission.csv.
evaluation_prediction = model.predict(evaluation_data)
output = pd.Series(
    evaluation_prediction,
    index=evaluation_data.index,
    name='SalePrice',
).reset_index()
output.to_csv('submission.csv', index=False)

In [13]:
# Save test predictions to file (neural network)

# Rows the network was fitted on: train_y keeps the Id index of the training
# split, so the unscaled features and target can be looked up again and the
# scaling statistics recomputed from them.
fit_ids = train_y.index
fit_features = x_training.loc[fit_ids].copy()
fit_target = y_training.loc[fit_ids]

fit_feature_mean = fit_features[numerical_col].mean()
# A zero standard deviation would divide by zero; leave such columns unscaled.
fit_feature_std = fit_features[numerical_col].std().replace(0, 1)
fit_target_mean = fit_target.mean()
fit_target_std = fit_target.std()

fit_features[numerical_col] = (fit_features[numerical_col] - fit_feature_mean) / fit_feature_std

# The evaluation features get the same scaling as the training features.
evaluation = evaluation_data.copy()
evaluation[numerical_col] = (evaluation[numerical_col] - fit_feature_mean) / fit_feature_std

# Categories that never occur in the training rows are blanked so the imputer
# fills them in, rather than reaching the encoder as unknown values.
for col in categorical_col:
    seen = fit_features[col].dropna().unique()
    evaluation[col] = evaluation[col].where(evaluation[col].isin(seen))

# Fit the preprocessor on the training rows (never on the evaluation data) so
# the imputers and encoders match the encoding the network was trained with,
# whatever data `preprocess` was last fitted on.
preprocess.fit(fit_features)
evaluation = pd.DataFrame(preprocess.transform(evaluation), columns=columns, index=evaluation_data.index)

evaluation_prediction = model.predict(evaluation).reshape(-1)
# The network predicts the standardised target; convert back to sale prices.
evaluation_prediction = evaluation_prediction * fit_target_std + fit_target_mean

output = pd.DataFrame({'Id': evaluation_data.index,
                       'SalePrice': evaluation_prediction})
output.to_csv('submission.csv', index=False)