## Step 1. Import Dataset

In [52]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [53]:
data = pd.read_csv('./data/train.csv')
data.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [54]:
data = pd.read_csv('./data/train.csv', index_col='Id')
data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


## Step 2. Data Preprocessing
### 2.1. Feature Selection

- Data Preprocessing: just filter the data which used and process data to be can use in Model Machine Learning
    * Select Features, (Done)
    * Splitting X, y
    * Data Imputation (Missing Data Replacing), (Done)
    * Splitting X_train, y_train, X_test, y_test
           
- Encode Categorical Data and Feature Scaling

- Training Model

- Model Evaluation

In [55]:
# Use Feature Engineering to select features
features = ["LotArea", "Street", "YearBuilt", "HouseStyle", "Electrical", "1stFlrSF", "2ndFlrSF",
            "FullBath", "BedroomAbvGr", "TotRmsAbvGrd"]


### 2.2. Splitting dataset into X, y

In [56]:
X = data[features]
y = data['SalePrice']

### 2.3. Data Imputation (Missing Data Replacement)

In [57]:
# Finding columns with missing data
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   LotArea       1460 non-null   int64 
 1   Street        1460 non-null   object
 2   YearBuilt     1460 non-null   int64 
 3   HouseStyle    1460 non-null   object
 4   Electrical    1459 non-null   object
 5   1stFlrSF      1460 non-null   int64 
 6   2ndFlrSF      1460 non-null   int64 
 7   FullBath      1460 non-null   int64 
 8   BedroomAbvGr  1460 non-null   int64 
 9   TotRmsAbvGrd  1460 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 125.5+ KB


In [58]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
X.loc[:,'Electrical'] = imputer.fit_transform(X[['Electrical']])[:,0]

X

Unnamed: 0_level_0,LotArea,Street,YearBuilt,HouseStyle,Electrical,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,8450,Pave,2003,2Story,SBrkr,856,854,2,3,8
2,9600,Pave,1976,1Story,SBrkr,1262,0,2,3,6
3,11250,Pave,2001,2Story,SBrkr,920,866,2,3,6
4,9550,Pave,1915,2Story,SBrkr,961,756,1,3,7
5,14260,Pave,2000,2Story,SBrkr,1145,1053,2,4,9
...,...,...,...,...,...,...,...,...,...,...
1456,7917,Pave,1999,2Story,SBrkr,953,694,2,3,7
1457,13175,Pave,1978,1Story,SBrkr,2073,0,2,3,7
1458,9042,Pave,1941,2Story,SBrkr,1188,1152,2,4,9
1459,9717,Pave,1950,1Story,FuseA,1078,0,1,2,5


### 2.4. X, y → X_train, y_train, X_valid, y_valid

In [59]:
from sklearn.model_selection import train_test_split

# Use random_state ~ seed()
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [60]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape
print(X_train)

      LotArea Street  YearBuilt HouseStyle Electrical  1stFlrSF  2ndFlrSF  \
Id                                                                          
619     11694   Pave       2007     1Story      SBrkr      1828         0   
871      6600   Pave       1962     1Story      SBrkr       894         0   
93      13360   Pave       1921     1Story      SBrkr       964         0   
818     13265   Pave       2002     1Story      SBrkr      1689         0   
303     13704   Pave       2001     1Story      SBrkr      1541         0   
...       ...    ...        ...        ...        ...       ...       ...   
764      9430   Pave       1999     2Story      SBrkr      1268      1097   
836      9600   Pave       1950     1Story      SBrkr      1067         0   
1217     8930   Pave       1978     1.5Fin      SBrkr      1318       584   
560      3196   Pave       2003     1Story      SBrkr      1557         0   
685     16770   Pave       1998     2Story      SBrkr      1195       644   

## Step 3. Encode Categorical Data and Feature Scaling

In [63]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols  = X.select_dtypes(include=['int64']).columns

categorical_cols_indicies = [X.columns.get_loc(col) for col in categorical_cols]
numeric_cols_indicies = [X.columns.get_loc(col) for col in numeric_cols]

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_cols_indicies),
                  ('num', StandardScaler(), numeric_cols_indicies)],
    remainder="passthrough"
)

X_train = ct.fit_transform(X_train)
X_valid = ct.transform(X_valid)


## Step 4. Traning Machine Learning Model

In [202]:
    from sklearn.ensemble import GradientBoostingRegressor

rf_model = GradientBoostingRegressor(random_state=1, n_estimators=210)
# rf_model = svm.SVR()
rf_model.fit(X_train, y_train)

In [203]:
rf_val_preds = rf_model.predict(X_valid)

In [204]:
rf_val_preds[:10], y_valid[: 10]

(array([235905.5464565 , 150998.91333717, 121362.39515738, 201039.7382342 ,
         89436.60396994, 102382.98278675, 240769.45238634, 131893.7276789 ,
        756821.96443313, 150401.10348472]),
 Id
 530     200624
 492     133000
 460     110000
 280     192000
 656      88000
 1014     85000
 1404    282922
 602     141000
 1183    745000
 688     148800
 Name: SalePrice, dtype: int64)

In [205]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
r2 = r2_score(y_valid, rf_val_preds)
mae = mean_absolute_error(y_valid, rf_val_preds)
print(f"Mean Absolute Error (MAE): {mae}")

Mean Absolute Error (MAE): 21302.192513974223


In [198]:
print(f"R-squared (R²): {r2}")

R-squared (R²): 0.8606627281611235
