In [2]:
# Import libraries
import pandas  as pd
import numpy   as np
import xgboost as xgb

In [4]:
# Load the train and test splits; the first CSV column becomes the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('playround_train.csv', 'playround_test.csv')
)

In [7]:
# Summarize the training frame: index range, column names, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22730 entries, 0 to 22729
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   squareMeters       22730 non-null  int64  
 1   numberOfRooms      22730 non-null  int64  
 2   hasYard            22730 non-null  int64  
 3   hasPool            22730 non-null  int64  
 4   floors             22730 non-null  int64  
 5   cityCode           22730 non-null  int64  
 6   cityPartRange      22730 non-null  int64  
 7   numPrevOwners      22730 non-null  int64  
 8   made               22730 non-null  int64  
 9   isNewBuilt         22730 non-null  int64  
 10  hasStormProtector  22730 non-null  int64  
 11  basement           22730 non-null  int64  
 12  attic              22730 non-null  int64  
 13  garage             22730 non-null  int64  
 14  hasStorageRoom     22730 non-null  int64  
 15  hasGuestRoom       22730 non-null  int64  
 16  price              22730 no

In [11]:
# Feature matrix: every training column except the 'price' target.
X_train = train_data.drop('price', axis=1)

# Target vector: the 'price' column on its own.
y_train = train_data.loc[:, 'price']

# The test frame is used as-is (same object as test_data, not a copy).
X_test = test_data

In [13]:
# Shapes of the training features, training target and test features, in that order.
tuple(frame.shape for frame in (X_train, y_train, X_test))

((22730, 16), (22730,), (15154, 16))

In [18]:
# Count missing values per feature column.
X_train.isna().sum()

squareMeters         0
numberOfRooms        0
hasYard              0
hasPool              0
floors               0
cityCode             0
cityPartRange        0
numPrevOwners        0
made                 0
isNewBuilt           0
hasStormProtector    0
basement             0
attic                0
garage               0
hasStorageRoom       0
hasGuestRoom         0
dtype: int64

In [19]:
# Baseline XGBoost regressor built with no explicit hyperparameters.
# It is the estimator handed to the permutation-importance and RFE steps further down.
model = xgb.XGBRegressor()

In [20]:
# Fit the baseline model on all 16 training features.
model.fit(X_train, y_train)

In [21]:
from sklearn.inspection import permutation_importance

# Permutation importance of every feature for the fitted baseline model.
# Each column is shuffled 30 times with a fixed seed; note that the scoring data
# is the same training set the model was fitted on.
r = permutation_importance(
    estimator=model,
    X=X_train,
    y=y_train,
    n_repeats=30,
    random_state=0,
)

In [23]:
# Feature names ordered by descending mean permutation importance, keeping only
# those whose mean importance stays positive after subtracting two standard deviations.
perm_features = [
    X_train.columns[idx]
    for idx in r.importances_mean.argsort()[::-1]
    if r.importances_mean[idx] - 2 * r.importances_std[idx] > 0
]

In [25]:
from sklearn.feature_selection import RFE

# Recursive Feature Elimination around the baseline model. Asking for a single
# surviving feature makes the elimination run all the way down, so rfe.ranking_
# ends up holding a rank for every column of X_train.
rfe = RFE(estimator=model, n_features_to_select=1).fit(X_train, y_train)

In [27]:
from operator import itemgetter

# Feature names arranged by ascending RFE ranking value; the sort key is the
# ranking only, so tied ranks keep their original column order.
rfe_features = [
    feature_name
    for _rank, feature_name in sorted(
        zip(rfe.ranking_, X_train.columns), key=itemgetter(0)
    )
]

In [32]:
# Union of the top 6 RFE-ranked features and the top 6 permutation-importance features.
# De-duplicate with dict.fromkeys (insertion-ordered) instead of a set: iterating a set
# of strings yields an order that can change between kernel restarts, which would
# reshuffle the model's input columns from run to run. Here the order is always
# RFE picks first, then any permutation-importance picks not already present.
combined_features = list(dict.fromkeys(rfe_features[:6] + perm_features[:6]))

In [34]:
# Filter the training and testing data to include only the combined selected features
X_train = X_train[combined_features]
X_test = X_test[combined_features]

In [35]:
# Display the reduced training feature frame to confirm which columns were kept.
X_train

Unnamed: 0_level_0,floors,basement,hasYard,numberOfRooms,garage,squareMeters,attic,cityCode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,47,8,1,24,369,34291,5196,35693
1,60,729,0,60,277,95145,4496,34773
2,62,7473,1,45,245,92661,8953,45457
3,59,6424,0,99,256,97184,8522,15113
4,57,7151,0,100,863,61752,2786,64245
...,...,...,...,...,...,...,...,...
22725,70,4477,1,84,345,55825,786,12031
22726,49,4811,1,88,755,65870,2454,23197
22727,39,5595,1,42,789,93192,4072,8539
22728,89,5358,1,86,411,65797,2513,23197


In [36]:
# Base estimator for the hyperparameter search, configured with eval_metric='rmsle'.
# NOTE(review): no eval_set is passed to any fit call in this notebook, so this
# metric presumably has no visible effect here - confirm against the xgboost docs.
regressor=xgb.XGBRegressor(eval_metric='rmsle')

In [37]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter candidates for the grid search: 2 depths x 3 tree counts x
# 2 learning rates = 12 combinations.
param_grid = dict(
    max_depth=[4, 5],
    n_estimators=[500, 600, 700],
    learning_rate=[0.01, 0.015],
)

In [38]:
# Fit the base regressor (default hyperparameters) on the selected features.
# NOTE(review): this same object is handed to GridSearchCV in a later cell, which
# presumably clones and refits it per candidate - if so this fit is redundant; confirm.
regressor.fit(X_train, y_train)

In [39]:
# Exhaustive grid search over param_grid around the base regressor. Neither
# `scoring` nor `cv` is passed, so the library defaults apply to both.
search = GridSearchCV(estimator=regressor, param_grid=param_grid)

In [40]:
# Run the grid search on the selected training features to find the best
# hyperparameter combination from param_grid.
search.fit(X_train, y_train)

In [41]:
# Display the best hyperparameter combination found by the grid search.
search.best_params_

{'learning_rate': 0.015, 'max_depth': 5, 'n_estimators': 500}

In [42]:
# Build the final XGBoost model from the hyperparameters the grid search selected.
# Reading them from search.best_params_ (instead of retyping the values shown in an
# earlier output) keeps this cell correct when the grid or the data changes and the
# search picks a different combination.
final_XGB = xgb.XGBRegressor(**search.best_params_)

In [43]:
# Fit the final XGBoost model on the full training set (selected features only).
final_XGB.fit(X_train, y_train)

In [44]:
# Predict prices for the test set with the final model (selected features only).
y_pred = final_XGB.predict(X_test)

In [49]:
# Predict prices for the training set itself; these are in-sample predictions
# from the same rows the final model was fitted on.
predictions = final_XGB.predict(X_train)

In [51]:
# Re-read the training CSV and take its 'price' column as the reference values.
# NOTE(review): this is the same file train_data was loaded from, so the score
# below is a training-set (in-sample) error, not a held-out test score.
solution = pd.read_csv('playround_train.csv', index_col=0)
y_true = solution["price"]

from sklearn.metrics import mean_squared_error

# Root Mean Squared Error between the true prices and the in-sample predictions.
# This is plain RMSE: no log transform is applied, so it is not RMSLE.
RMSE = np.sqrt(mean_squared_error(y_true, predictions))
print("The score is %.5f" % RMSE)

The score is 90078.45076
