# 5 - Modeling

In [78]:
# Standard library
import warnings

# Third-party
import pandas as pd
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# warnings.filterwarnings('ignore')

# Load the round-1 train and test frames from their pickles.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
train = pd.read_pickle('../pickles/final/round_1/train_final')
test = pd.read_pickle('../pickles/final/round_1/test_final')

Some columns in the dataframes have the `object` dtype, which is not compatible with XGBoost. These columns need to be converted to integer columns.

In [79]:
# Names of every training column currently stored with the generic 'object' dtype
train_col = list(train.select_dtypes(include='object').columns)

# Cast all of those columns to int in a single assignment
train[train_col] = train[train_col].astype(int)

In [80]:
# Names of every test column currently stored with the generic 'object' dtype
test_col = list(test.select_dtypes(include='object').columns)

# Cast all of those columns to int in a single assignment
test[test_col] = test[test_col].astype(int)

### Parameter Tuning

In [81]:
# Split the training frame into the target and the feature matrix
y_train = train['SalePrice']
X_train = train.drop(columns='SalePrice')

# Untuned base regressor with a fixed seed
boost = XGBRegressor(random_state=42, objective='reg:squarederror')

In [82]:
# Hyper-parameter grid: three candidate values for each of five settings
params = {
    # XGBoost's L1 (alpha) and L2 (lambda) regularisation terms
    'alpha': [25, 30, 35],
    'lambda': [45, 50, 55],
    # Tree depth, shrinkage and number of boosting rounds
    'max_depth': [4, 5, 6],
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [555, 560, 565],
}

In [83]:
# Exhaustive search over `params`, scored by negated RMSE, on all available cores
search = GridSearchCV(
    estimator=boost,
    param_grid=params,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
result = search.fit(X_train, y_train)

# The scorer is a negated RMSE, so flip the sign to report a positive error
print('Best Score: %s' % (-result.best_score_))
print('Best Hyperparameters: %s' % result.best_params_)

  _data = np.array(data, dtype=dtype, copy=copy,


Best Score: 23761.452261334955
Best Hyperparameters: {'alpha': 30, 'lambda': 50, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 560}


### The Model

In [84]:
# Build the final regressor from the grid-search winner rather than from
# hand-copied numbers, so the model cannot drift out of sync with the search
# if the grid or the data change. Objective and seed match the base estimator.
model = XGBRegressor(objective='reg:squarederror', random_state=42, **result.best_params_)
model.fit(X_train, y_train)

# Predictions on the same rows the model was fitted on (in-sample)
pred_train = model.predict(X_train)

In [85]:
# Both scores compare predictions against the training labels, so they are in-sample figures
rmse = root_mean_squared_error(y_true=y_train, y_pred=pred_train)
r2 = r2_score(y_true=y_train, y_pred=pred_train)

print(f'Root Mean Squared Error: {rmse} \nR-Squared: {r2}')

Root Mean Squared Error: 6321.552034749748 
R-Squared: 0.9931768774986267


In [86]:
# Select the test features in exactly the column order used for fitting, so the
# prediction does not depend on `test` happening to share the training layout.
pred_test = model.predict(test[X_train.columns])

The submission format is two columns, `Id` and `SalePrice`, for the test set.

In [87]:
# Load the cleaned test frame; the following cell takes its 'Id' column for the submission.
# NOTE(review): presumably 'Id' is absent from the modelling frame `test` - confirm.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
for_cols = pd.read_pickle('../pickles/cleaned/test_cleaned')

In [88]:
# Start the submission frame from the Id column alone
submit = for_cols['Id'].to_frame()
submit.head()

Unnamed: 0,Id
0,1461
1,1462
2,1463
3,1464
4,1465


In [89]:
# Attach the test predictions, converted to a plain Python list, as the SalePrice column
submit = submit.assign(SalePrice=pred_test.tolist())
submit.head()

Unnamed: 0,Id,SalePrice
0,1461,124257.859375
1,1462,170114.53125
2,1463,175484.15625
3,1464,199337.265625
4,1465,183157.828125


In [90]:
# Write the submission to CSV; index=False keeps the DataFrame index out of the file
submit.to_csv('../data/submission1.csv',index=False)