In [24]:
import pandas as pd

# Read the training set and keep a small preview of it for display.
train_df = pd.read_csv('/kaggle/input/house-prices/train.csv')
train_df_head = train_df.head()

# Same for the test set; its preview is shown by a separate cell.
test_df = pd.read_csv('/kaggle/input/house-prices/test.csv')
test_df_head = test_df.head()

# NOTE(review): the sample submission is read from a different dataset
# directory than train/test, and `acu_chk` is not referenced by any cell
# visible here — confirm both datasets are attached and whether this load
# is still needed.
acu_chk = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')

# Last expression of the cell: render the training preview.
train_df_head

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [25]:
# Render the preview of the test set (the result of `test_df.head()`).
test_df_head

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

# Split the training frame into the regression target and the feature columns.
y_train_full = train_df['SalePrice']
X_train_full = train_df.drop(columns='SalePrice')

# Route each feature column by dtype: object columns are handled as
# categorical, int64/float64 columns as numerical.
categorical_cols = [
    col_name
    for col_name, col_dtype in X_train_full.dtypes.items()
    if col_dtype == "object"
]
numerical_cols = [
    col_name
    for col_name, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

# Numerical gaps are filled with a constant; no fill_value is passed, so the
# imputer's default constant applies.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical gaps take the most frequent value, then the column is one-hot
# encoded. Categories unseen at fit time are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each transformer to its own group of columns.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# NOTE(review): the preprocessor is fitted on every training row before the
# split below, so rows that end up in the validation part also take part in
# fitting the imputers and the encoder.
X_train_preprocessed = preprocessor.fit_transform(X_train_full)

# Hold out 20% of the preprocessed rows for validation (fixed random_state).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_preprocessed,
    y_train_full,
    test_size=0.2,
    random_state=0,
)

print("X_train shape is: ", X_train.shape)
print("X_valid shape is: ",X_valid.shape)
print("y_train shape is: ",y_train.shape)
print("y_valid shape is: ",y_valid.shape)


X_train shape is:  (1168, 289)
X_valid shape is:  (292, 289)
y_train shape is:  (1168,)
y_valid shape is:  (292,)


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are set by the search below.
xgb_model = XGBRegressor()

# Candidate values; the grid search tries every combination of them.
param_grid = dict(
    n_estimators=[10000, 3000],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1],
    subsample=[0.7, 0.9],
)

# Grid search with 5-fold cross-validation (cv=5), scored by negated MSE.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# best_score_ is a negated MSE: flip the sign, then take the root to get RMSE.
best_score = np.sqrt(-grid_search.best_score_)
best_params = grid_search.best_params_

best_params, best_score


In [None]:
# Transform the full training frame with the already-fitted preprocessor.
X_train_full_preprocessed = preprocessor.transform(X_train_full)

# Final model trained on all training rows.
# NOTE(review): the hyperparameters are written out literally here rather than
# read from the grid search's `best_params` — confirm they match its result.
final_model = XGBRegressor(
    learning_rate=0.01,
    n_estimators=10000,
    max_depth=5,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    objective='reg:squarederror',
    n_jobs=-1,           # canonical name for the deprecated `nthread` alias
    scale_pos_weight=1,
    random_state=27,     # canonical name for the deprecated `seed` alias
    reg_alpha=0.00006,
)

final_model.fit(X_train_full_preprocessed, y_train_full)

# Apply the same fitted preprocessing to the test frame, then predict.
X_test_preprocessed = preprocessor.transform(test_df)
predictions = final_model.predict(X_test_preprocessed)

# Show a short sample only; the slice now matches the stated "first 10".
predictions[:10]  # Display the first 10 predictions


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the final model on the full preprocessed
# training data, scored with negated RMSE.
scores = cross_val_score(
    final_model,
    X_train_full_preprocessed,
    y_train_full,
    cv=5,
    scoring='neg_root_mean_squared_error',
)

# The scorer reports negated values; flip the sign to get positive RMSEs.
rmse_scores = -scores


In [None]:
# Summarise the per-fold RMSEs by their mean and standard deviation.
rmse_mean, rmse_std = rmse_scores.mean(), rmse_scores.std()
print(rmse_mean, rmse_std)

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error

# NOTE(review): this frame is read from a file named sample_submission.csv.
# If its SalePrice column holds placeholder/benchmark values rather than
# ground truth, any error computed against it is not a real test error —
# confirm before interpreting the result.
actual_prices_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')

# Pair every test Id with the model's prediction for it.
predictions_df = pd.DataFrame({
    'Id': test_df['Id'],
    'PredictedSalePrice': predictions
})

# Join reference prices and predictions on Id (default inner join).
results_df = actual_prices_df.merge(predictions_df, on='Id')

results_df

In [None]:
# RMSE between the reference SalePrice column and the predicted prices:
# compute the mean squared error first, then take its square root.
squared_error_mean = mean_squared_error(
    y_true=results_df['SalePrice'],
    y_pred=results_df['PredictedSalePrice'],
)
rmse = sqrt(squared_error_mean)
rmse