# Step 1: Importing libraries and raw data

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt



In [2]:
# Directory the notebook is run from; later cells build the CSV paths from it.
pwd = os.getcwd()
# The train/test CSVs are loaded in the next cell with index_col=0. The
# un-indexed load that used to live here was overwritten before anything read
# it, so it only doubled the file I/O and has been dropped.


In [3]:
# Read both splits, using the first CSV column as the row index.
train_data, test_data = (
    pd.read_csv(f"{pwd}/{csv_name}", index_col=0)
    for csv_name in ("train.csv", "test.csv")
)

# Step 2: Exploring data and log-transforming skewed variables

In [4]:
# Keep an untouched copy of the training frame (still containing SalePrice);
# the categorical imputation further down reads column modes from it.
train_data_original = train_data.copy()

# The target is modelled on the log1p scale; predictions are mapped back with
# expm1 when the submission is built.
train_y = np.log1p(train_data.pop("SalePrice"))

# Apply log1p to the area columns on BOTH splits. Transforming only the
# training frame would leave the model fitted on log-scale values while the
# test rows it predicts on still carry raw values.
for frame in (train_data, test_data):
    for area_col in ('LotArea', 'GrLivArea'):
        frame[area_col] = np.log1p(frame[area_col])


# Step 3: Feature Engineering

In [5]:
# Stack the training rows above the test rows so the cells below preprocess
# them as a single frame.
all_data = pd.concat([train_data, test_data], axis=0)


In [6]:
# Everything that is not int64/float64 is treated as a non-numeric feature.
non_num_feat = all_data.select_dtypes(exclude=['int64','float64'])

# Names of the non-numeric columns that contain at least one missing value.
non_num_feat_with_missing = [
    feature for feature in non_num_feat.columns
    if all_data[feature].isnull().any()
]
        



In [7]:
# Columns that are cast to string and whose missing entries receive a
# per-column "<column>data none" placeholder.
fill_None = ['Alley','MasVnrType','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','PoolQC','Fence','MiscFeature',"MSSubClass","OverallQual","OverallCond"]
all_data[fill_None] = all_data[fill_None].astype(str)

# Snapshot taken right after the casts; a later cell derives the numeric
# column list from it.
all_data_temp = all_data.copy()

for col in fill_None:
    placeholder = col+'data none'
    # astype(str) may have turned NaN into the literal 'nan', so both real
    # missing values and that string are mapped to the placeholder.
    all_data[col] = all_data[col].fillna(placeholder).replace('nan', placeholder)

all_data["Exterior2nd"] = all_data["Exterior2nd"].fillna('SameasEx1st')

# Whatever is still missing in a non-numeric column gets the most frequent
# value observed in the original training frame.
for col in non_num_feat_with_missing:
    most_common = train_data_original[col].mode()[0]
    all_data[col] = all_data[col].fillna(most_common)



In [8]:
# Numeric columns are taken from the snapshot made right after the string casts.
num_feat = all_data_temp.select_dtypes(exclude=['object'])
num_feat_with_missing = []

# Record every numeric column that still has a missing value in all_data.
for col in num_feat.columns:
    has_gap = all_data[col].isna().any()
    if has_gap:
        num_feat_with_missing.append(col)
        

In [9]:
# Area-style columns: a missing value is filled with 0.
fill_zero = ['MasVnrArea','BsmtFinSF1', 'BsmtFinSF2','BsmtUnfSF', 'TotalBsmtSF','GarageArea']
all_data[fill_zero] = all_data[fill_zero].fillna(0)

# Remaining numeric gaps are filled with the median of the training rows.
fill_median = ['LotFrontage','BsmtFullBath', 'BsmtHalfBath','GarageYrBlt', 'GarageCars']
for col in fill_median:
    # The coercion belongs inside the loop: placed before it, it acted on
    # whatever `col` an earlier cell's loop happened to leave behind instead
    # of on the columns being imputed here.
    all_data[col] = pd.to_numeric(all_data[col], errors='coerce')
    all_data[col] = all_data[col].fillna(train_data[col].median())
    all_data[col] = all_data[col].astype('float64')


In [10]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# --- Ordinal features: each one is paired with its ordered category list ---
rating_1_to_10 = [str(score) for score in range(1, 11)]
quality_scale = ['Po', 'Fa', 'TA', 'Gd', 'Ex']

ordinal_spec = [
    ("OverallQual", list(rating_1_to_10)),
    ("OverallCond", list(rating_1_to_10)),
    ("ExterQual", list(quality_scale)),
    ("KitchenQual", list(quality_scale)),
    ("BsmtQual", ['BsmtQualdata none'] + quality_scale),
    ("GarageQual", ['GarageQualdata none'] + quality_scale),
    ("ExterCond", list(quality_scale)),
    ("HeatingQC", list(quality_scale)),
    ("BsmtCond", ['BsmtConddata none'] + quality_scale),
    ("GarageCond", ['GarageConddata none'] + quality_scale),
]
ordinal_feat = [name for name, levels in ordinal_spec]
ordinal_cat = [levels for name, levels in ordinal_spec]

# --- Remaining string columns: one-hot when they have at most 10 distinct
# values, integer label codes otherwise ---
nonordinal_feat = [name for name in all_data.select_dtypes(include='object') if name not in ordinal_feat]
cardinality = {name: all_data[name].nunique() for name in nonordinal_feat}
onehot_feat = [name for name in nonordinal_feat if cardinality[name] <= 10]
label_feat = [name for name in nonordinal_feat if cardinality[name] > 10]

ordinal_step = ('ord_encoder', OrdinalEncoder(categories=ordinal_cat), ordinal_feat)
onehot_step = ('onehot_encoder', OneHotEncoder(drop='first', sparse_output=False, dtype=int), onehot_feat)
encoder = ColumnTransformer(transformers=[ordinal_step, onehot_step], remainder='passthrough')

# Only the ordinal and one-hot columns are passed in, so the transformer's
# output columns are exactly: ordinal features, then the one-hot dummies.
encoded_data = all_data[ordinal_feat + onehot_feat]
encoded_result = encoder.fit_transform(encoded_data)

onehot_col_names = encoder.named_transformers_['onehot_encoder'].get_feature_names_out(onehot_feat)
all_encoded_col_names = ordinal_feat + list(onehot_col_names)

encoded_df = pd.DataFrame(encoded_result, columns=all_encoded_col_names, index=all_data.index)

# High-cardinality columns get one integer code per distinct string value.
label_encoded_df = pd.DataFrame(index=all_data.index)
for feature in label_feat:
    codes = LabelEncoder().fit_transform(all_data[feature].astype(str))
    label_encoded_df[feature] = codes

# Swap the raw categorical columns for their encoded counterparts.
already_encoded = ordinal_feat + onehot_feat + label_feat
all_data_final = pd.concat(
    [all_data.drop(columns=already_encoded), encoded_df, label_encoded_df],
    axis=1,
)

In [11]:
# Split the combined frame back apart. The training rows were concatenated
# first, so their count is taken from the target vector rather than being
# hard-coded to one particular dataset size.
n_train = len(train_y)
train_data = all_data_final.iloc[:n_train, :].copy()
test_data = all_data_final.iloc[n_train:, :].copy()

# Re-attach the log-scale target so the outlier filter below can drop feature
# rows and target rows together.
train_data = pd.concat((train_data, train_y), axis=1)

In [12]:
# Columns screened for extreme values, and their [Q1 - 2*IQR, Q3 + 2*IQR]
# fences computed on the training frame. lower_list / upper_list are
# positionally aligned with ext_col.
ext_col = ["SalePrice","GrLivArea","LotArea","TotalBsmtSF","MiscVal"]
lower_list, upper_list = [], []
for name in ext_col:
    first_quartile, third_quartile = train_data[name].quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    lower_list.append(first_quartile - 2*spread)
    upper_list.append(third_quartile + 2*spread)


In [13]:
# Look the fences up by column name instead of by list position, so the
# filter cannot silently drift out of sync with ext_col.
lower_bound = dict(zip(ext_col, lower_list))
upper_bound = dict(zip(ext_col, upper_list))

# SalePrice and GrLivArea are fenced on both sides; the other columns only
# have an upper fence. between() is inclusive on both ends, like >= / <=.
within_fences = (
    train_data['SalePrice'].between(lower_bound['SalePrice'], upper_bound['SalePrice'])
    & train_data['GrLivArea'].between(lower_bound['GrLivArea'], upper_bound['GrLivArea'])
    & (train_data['LotArea'] <= upper_bound['LotArea'])
    & (train_data['TotalBsmtSF'] <= upper_bound['TotalBsmtSF'])
    & (train_data['MiscVal'] <= upper_bound['MiscVal'])
)

# Rows with a very large living area but a low price. Both columns are on the
# log1p scale here, hence the log1p-transformed thresholds.
large_but_cheap = (
    (train_data['GrLivArea'] > np.log1p(4000))
    & (train_data['SalePrice'] < np.log1p(300000))
)

# .copy() makes the result an independent frame: the next cell adds columns
# to it, which on a boolean-indexed slice raises SettingWithCopyWarning.
train_data = train_data[within_fences & ~large_but_cheap].copy()
                        


In [14]:
def _add_engineered_features(frame):
    """Add the derived feature columns to ``frame`` in place.

    The same eight columns are appended, in the same order, to whichever
    frame is passed in, so the training and test frames cannot drift apart.
    Ratio features fall back to 0 where the denominator is not positive.

    NOTE(review): GrLivArea and LotArea were log1p-transformed in Step 2, so
    the sums and ratios built from them combine log-scale values with raw
    ones (e.g. TotalBsmtSF, BedroomAbvGr) — confirm this is intended.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the source columns read below; it is modified in place.
    """
    frame["TotalLivingArea"] = frame["GrLivArea"] + frame["TotalBsmtSF"]

    frame["LivingLotRatio"] = np.where(
        frame["LotArea"] > 0,
        frame["GrLivArea"] / frame["LotArea"],
        0
    )

    # Ages are measured relative to the year of sale.
    frame["HouseAge"] = frame["YrSold"] - frame["YearBuilt"]
    frame["RemodAge"] = frame["YrSold"] - frame["YearRemodAdd"]

    # A half bath counts as 0.5 of a full bath.
    frame["TotalBathEquiv"] = frame["FullBath"] + (frame["HalfBath"]*0.5)

    frame["TotalPorchArea"] = (
        frame["WoodDeckSF"] + frame["OpenPorchSF"] +
        frame["EnclosedPorch"] + frame["3SsnPorch"] +
        frame["ScreenPorch"]
    )

    frame["GarageAreaPerCar"] = np.where(
        frame["GarageCars"] > 0,
        frame["GarageArea"] / frame["GarageCars"],
        0
    )

    frame["LivingAreaPerBedroom"] = np.where(
        frame["BedroomAbvGr"] > 0,
        frame["GrLivArea"] / frame["BedroomAbvGr"],
        0
    )


_add_engineered_features(train_data)
_add_engineered_features(test_data)

# Step 4: Modeling

In [15]:
from sklearn.model_selection import train_test_split

# Target stays a single-column DataFrame; features are every other column.
y = train_data[["SalePrice"]]
X = train_data.drop(columns="SalePrice")

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
import xgboost as xg
from sklearn.model_selection import cross_val_score, KFold


# First search stage: tree depth x minimum child weight, scored by the mean
# 5-fold cross-validated RMSE on the training split.
depth = [3,4,5,6]
mcw = [1,2]
best_rmse_dm, best_depth, best_mcw = float('inf'), 0, 0

# Settings shared by every candidate in this stage.
stage_defaults = dict(
    n_estimators=200,
    subsample=0.8,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
)
# The integer seed makes the splitter produce the same folds on every use.
cv_folds = KFold(n_splits=5, shuffle=True, random_state=42)

candidates = [(tree_depth, child_weight) for tree_depth in depth for child_weight in mcw]
for tree_depth, child_weight in candidates:
    candidate_model = xg.XGBRegressor(
        **stage_defaults,
        max_depth=tree_depth,
        min_child_weight=child_weight,
    )
    neg_mse_per_fold = cross_val_score(
        candidate_model, X_train, y_train,
        scoring='neg_mean_squared_error', cv=cv_folds,
    )
    mean_rmse = np.sqrt(-neg_mse_per_fold).mean()
    # Strict comparison: on a tie the earlier candidate is kept.
    if mean_rmse < best_rmse_dm:
        best_rmse_dm, best_depth, best_mcw = mean_rmse, tree_depth, child_weight





In [17]:
# Second search stage: gamma, with depth and min_child_weight fixed at the
# values chosen in the previous stage. Scored by mean 5-fold CV RMSE.
gamma = [0,0.1,0.2,0.3]
best_gamma, best_rmse_g = 0, float('inf')

# The integer seed makes the splitter produce the same folds on every use.
gamma_folds = KFold(n_splits=5, shuffle=True, random_state=42)

for gamma_value in gamma:
    candidate_model = xg.XGBRegressor(
        n_estimators=200,
        subsample=0.8,
        learning_rate=0.05,
        n_jobs=-1,
        random_state=42,
        max_depth=best_depth,
        min_child_weight=best_mcw,
        gamma=gamma_value,
    )
    neg_mse_per_fold = cross_val_score(
        candidate_model, X_train, y_train,
        scoring='neg_mean_squared_error', cv=gamma_folds,
    )
    mean_rmse = np.sqrt(-neg_mse_per_fold).mean()
    # Strict comparison: on a tie the earlier (smaller) gamma is kept.
    if mean_rmse < best_rmse_g:
        best_rmse_g, best_gamma = mean_rmse, gamma_value


In [18]:
# Third search stage: learning rate, with the tree count chosen by early
# stopping against the validation split.
learning_rates = [0.01,0.03, 0.05, 0.07]
best_lr = 0.05
best_n_estimators = 200
best_val_rmse = float('inf')

# y_val is a single-column frame, so its values have shape (n, 1) while
# predict() returns shape (n,). Subtracting those broadcasts to an (n, n)
# matrix and yields a meaningless RMSE; flatten the target once up front.
y_val_flat = np.ravel(y_val)

for lr in learning_rates:
    my_model = xg.XGBRegressor(
        n_estimators = 1000,
        subsample=0.8,
        learning_rate = lr,
        n_jobs=-1,
        random_state = 42,
        max_depth=best_depth,
        min_child_weight = best_mcw,
        gamma = best_gamma,
        early_stopping_rounds = 50)

    my_model.fit(X_train, y_train,eval_set = [(X_val, y_val)],verbose = False )
    val_pred = my_model.predict(X_val)
    val_rmse = np.sqrt(np.mean((val_pred - y_val_flat) ** 2))
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_lr = lr
        # best_iteration is a zero-based round index, so the number of trees
        # to train is one more than it.
        best_n_estimators = my_model.best_iteration + 1

In [20]:
# Final estimator: the hyper-parameters picked by the three search stages,
# without early stopping, fitted on the training split.
final_params = dict(
    n_estimators=best_n_estimators,
    subsample=0.8,
    learning_rate=best_lr,
    n_jobs=-1,
    random_state=42,
    max_depth=best_depth,
    gamma=best_gamma,
    min_child_weight=best_mcw,
)
final_model = xg.XGBRegressor(**final_params)

final_model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [21]:
# The model was fitted on a log1p-scaled target; expm1 maps the predictions
# back to the original scale.
prediction = np.expm1(final_model.predict(test_data))

# test.csv was loaded with index_col=0, so the frame's index already carries
# the row ids and the file does not need to be read a second time.
# NOTE(review): this assumes "Id" is the first column of test.csv — confirm.
prediction_id = test_data.index
submission = pd.DataFrame({
    'Id': prediction_id,
    'SalePrice': prediction
})


In [22]:
# Write the two submission columns only; the frame's row index is left out.
submission.to_csv(path_or_buf='submission.csv', index=False)