In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool, metrics, cv

In [4]:
# Read the three input CSVs: labelled training rows, the test rows to score,
# and the sample submission file.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
sample_submission_df = pd.read_csv('data/sample_submission.csv')

In [5]:
# Target column to predict.
y = train_df['SalePrice']
# Feature matrix: every training column except the target and the row 'Id'.
X = train_df.drop(columns=['SalePrice', 'Id'])

In [6]:
# Keep the test Ids aside for the submission file; 'Id' is dropped from the
# test features the same way it is dropped from the training features.
submission_id = test_df['Id']
final_test_df = test_df.drop(columns=['Id'])

In [7]:
# Show (rows, columns) for the train and test feature frames side by side.
print(*(frame.shape for frame in (X, final_test_df)))

(1460, 79) (1459, 79)


In [8]:
# Stack train features on top of test features so both are imputed together.
# The original row labels are kept, so index labels repeat across the two
# parts; later cells split the frame again by position (iloc), not by label.
df = pd.concat((X, final_test_df), axis=0)

In [9]:
# Shape of the stacked train + test feature frame.
df.shape

(2919, 79)

In [10]:
# Impute missing values in the stacked train + test frame.
#
# * LotFrontage gaps get the median of X.LotFrontage, so the statistic comes
#   from the training features rather than from the stacked frame.
# * Non-numeric columns get a string sentinel instead of the integer 0, so each
#   of those columns keeps a single value type (strings only) rather than a
#   mix of str and int.
#   NOTE(review): 'Missing' is presumed not to be an existing category level
#   in any column -- confirm against the data.
# * Whatever is still missing afterwards (numeric columns) is filled with 0.
#
# The result is rebound to `df` instead of using inplace=True, so the cell is
# a plain assignment.
non_numeric_columns = df.select_dtypes(exclude='number').columns
fill_values = {column: 'Missing' for column in non_numeric_columns}
fill_values['LotFrontage'] = X.LotFrontage.median()
df = df.fillna(fill_values).fillna(0)

In [11]:
# Split the imputed frame back into its train and test parts by position.
# The boundary is the number of training targets (len(y)) rather than a
# hard-coded row count, so the split stays correct if the training data
# changes size.
X = df.iloc[:len(y)]
final_test_df = df.iloc[len(y):]

In [12]:
# Re-check (rows, columns) of both frames after imputation and re-splitting.
print(*(frame.shape for frame in (X, final_test_df)))

(1460, 79) (1459, 79)


In [13]:
# Per-column dtypes of the training features; used below to pick out the
# non-numeric (categorical) columns.
X.dtypes

MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 79, dtype: object

In [14]:
# Positional indices of every column whose dtype is neither float64 nor int64;
# these positions are passed to the model as its categorical features.
categorical_features_indices = np.flatnonzero(
    (~((X.dtypes == 'float64') | (X.dtypes == 'int64'))).to_numpy()
)

In [15]:
# Display the column positions that will be treated as categorical.
categorical_features_indices

array([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23,
       24, 26, 27, 28, 29, 30, 31, 32, 34, 38, 39, 40, 41, 52, 54, 56, 57,
       59, 62, 63, 64, 71, 72, 73, 77, 78], dtype=int64)

In [16]:
# Use 75% of the labelled rows for training and keep the rest for validation;
# the fixed random_state makes the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=0,
)

In [19]:
# CatBoost regressor: 150 iterations, fixed seed, training log output silenced.
model = CatBoostRegressor(iterations=150, random_seed=42, logging_level='Silent')

In [20]:
# Fit on the training split, track metrics on the validation split, and render
# CatBoost's interactive training plot. The trailing ';' suppresses the echo of
# fit()'s return value in the notebook.
model.fit(
    X=X_train,
    y=y_train,
    eval_set=(X_validation, y_validation),
    cat_features=categorical_features_indices,
    plot=True,
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [23]:
# Score the imputed test features with the fitted model.
y_pred = model.predict(data=final_test_df)

In [24]:
# Pair each test Id with its predicted SalePrice.
submission = pd.DataFrame(
    data={
        'Id': submission_id,
        'SalePrice': y_pred,
    }
)

In [25]:
# Write the submission file without the DataFrame index column.
submission.to_csv(
    'CatBoost.csv',
    index=False,
)