In [45]:
import pandas as pd
pd.set_option("display.max_columns",None)

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from mrmr import mrmr_regression
from xgboost import XGBRegressor

def dataPreparation(df):
    X  = df.drop("SalePrice",axis=1)
    y  = np.log(df["SalePrice"])

    # Hanya gunakan kolom numerik
    X = X.select_dtypes(include=['number'])

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
    feature_list = mrmr_regression(X_train,y_train,K=8)
    print(f"Feature yang berpengaruh terhadap SalePrice :{','.join(feature_list)}")
    X_train  = X_train[feature_list]
    X_test   = X_test[feature_list]
    return X_train, X_test, y_train, y_test   

def processingPipeline():
    pipeline = Pipeline(steps=[
        ('preprocessor', StandardScaler()),
        ('regressor', XGBRegressor(random_state=42))
    ])

    return pipeline

def randomizedSearch(pipeline, X_train, y_train):
   
    param_distributions = {
        'regressor__max_depth': [3, 4, 5, 6,7,8,9,10],
        'regressor__learning_rate': [0.001, 0.01, 0.1],
    }

   
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=10,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=1,
        random_state=42
    )

  
    random_search.fit(X_train, y_train)

    print(f"Best Parameters: {random_search.best_params_}")
    print(f"Best Score: {random_search.best_score_}")

    return random_search.best_estimator_

def eval(model,X_train,X_test,y_train,y_test):
    pred_train = model.predict(X_train)
    pred_test  = model.predict(X_test)

    print(f"R2 score Train -> {r2_score(y_train,pred_train)}")
    print(f"RMSE Train -> {mean_squared_error(np.exp(y_train),np.exp(pred_train))}\n")
    print(f"R2 score test -> {r2_score(y_test,pred_test)}")
    print(f"RMSE Train -> {mean_squared_error(np.exp(y_test),np.exp(pred_test))}")

In [46]:
df = pd.read_csv("train.csv")
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1379.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,567.240411,1057.429452,1162.626712,346.992466,5.844521,1515.463699,0.425342,0.057534,1.565068,0.382877,2.866438,1.046575,6.517808,0.613014,1978.506164,1.767123,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,441.866955,438.705324,386.587738,436.528436,48.623081,525.480383,0.518911,0.238753,0.550916,0.502885,0.815778,0.220338,1.625393,0.644666,24.689725,0.747315,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,223.0,795.75,882.0,0.0,0.0,1129.5,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1961.0,1.0,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,477.5,991.5,1087.0,0.0,0.0,1464.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1980.0,2.0,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,808.0,1298.25,1391.25,728.0,0.0,1776.75,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2002.0,2.0,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,2336.0,6110.0,4692.0,2065.0,572.0,5642.0,3.0,2.0,3.0,2.0,8.0,3.0,14.0,3.0,2010.0,4.0,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [35]:
# Contoh untuk memeriksa tipe data
print(df.dtypes)

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object


In [47]:
X_train, X_test, y_train, y_test = dataPreparation(df)

100%|██████████| 8/8 [00:00<00:00, 21.80it/s]

Feature yang berpengaruh terhadap SalePrice :OverallQual,GrLivArea,GarageCars,YearBuilt,TotalBsmtSF,GarageArea,YearRemodAdd,FullBath





In [48]:
pipeline = processingPipeline()
best_model = randomizedSearch(pipeline, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'regressor__max_depth': 3, 'regressor__learning_rate': 0.1}
Best Score: -0.026525105505628487


In [49]:
eval(best_model, X_train, X_test, y_train, y_test)

R2 score Train -> 0.9173270293352064
RMSE Train -> 459202975.7094229

R2 score test -> 0.8553579695182786
RMSE Train -> 786052584.236904


In [50]:
import pickle
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)
