## Import data

In [1]:
import pandas as pd

data = pd.read_csv("vietnam_housing_dataset.csv")

data

Unnamed: 0,Address,Area,Frontage,Access Road,House direction,Balcony direction,Floors,Bedrooms,Bathrooms,Legal status,Furniture state,Price
0,"Dự án The Empire - Vinhomes Ocean Park 2, Xã L...",84.0,,,,,4.0,,,Have certificate,,8.60
1,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",60.0,,,,,5.0,,,,,7.50
2,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",90.0,6.0,13.0,Đông - Bắc,Đông - Bắc,5.0,,,Sale contract,,8.90
3,"Đường Nguyễn Văn Khối, Phường 11, Gò Vấp, Hồ C...",54.0,,3.5,Tây - Nam,Tây - Nam,2.0,2.0,3.0,Have certificate,Full,5.35
4,"Đường Quang Trung, Phường 8, Gò Vấp, Hồ Chí Minh",92.0,,,Đông - Nam,Đông - Nam,2.0,4.0,4.0,Have certificate,Full,6.90
...,...,...,...,...,...,...,...,...,...,...,...,...
30224,"Đường Lê Quang Định, Phường 1, Gò Vấp, Hồ Chí ...",67.0,4.1,16.0,,,1.0,3.0,2.0,Have certificate,,4.60
30225,"Đường Ngô Gia Tự, Phường Đức Giang, Long Biên,...",30.0,,,,,5.0,3.0,3.0,Have certificate,,4.70
30226,"Đường Gò Dưa, Phường Tam Bình, Thủ Đức, Hồ Chí...",69.4,4.0,15.0,Đông - Bắc,Đông - Bắc,,,,Have certificate,Basic,7.50
30227,"Đường Quang Trung, Phường 11, Gò Vấp, Hồ Chí Minh",96.0,,8.0,,,4.0,,,,,9.50


In [2]:
data.isnull().sum()

Address                  0
Area                     0
Frontage             11564
Access Road          13297
House direction      21239
Balcony direction    24983
Floors                3603
Bedrooms              5162
Bathrooms             7074
Legal status          4506
Furniture state      14119
Price                    0
dtype: int64

From this summary, I will decide that Frontage are numberic, and if they are null, I will replace 0.

For Access Road, Floors, Bedrooms, Bathrooms, I will use median for null

For House direction, Balcony direction, Legal status, Furniture state will be applied one hot encoding and if they are null, I will replace Unknown

In [3]:
data.groupby('Furniture state').count()

Unnamed: 0_level_0,Address,Area,Frontage,Access Road,House direction,Balcony direction,Floors,Bedrooms,Bathrooms,Legal status,Price
Furniture state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Basic,5519,5519,3810,3677,2094,1354,5085,5116,4927,5342,5519
Full,10591,10591,7114,6634,2958,1982,10165,9931,9379,10206,10591


So we will have 2 types of Furniture state, and I will replace as 0 and 1

## Data cleaning

In [4]:
data2 = data

### Get district
The price depends on the District

In [5]:
def get_district(address):
    parts = str(address).split(',')
    if len(parts) >= 2:
        return parts[-2].strip()
    else:
        return "Unknown"



data2['District'] = data2.Address.apply(get_district)

district_count = data2.District.value_counts()
filter_district = district_count[district_count <= 50]

def filter_dis(address):
    if address in filter_district:
        return 'Other'
    else:
        return address

data2['District'] = data2.District.apply(filter_dis)
data2.District.value_counts()

District
Other        1754
Gò Vấp       1373
Hà Đông      1285
Long Biên    1242
Bình Tân     1159
             ... 
Hạ Long        64
Quận 5         60
Ninh Kiều      55
Vĩnh Cửu       52
Hoàn Kiếm      51
Name: count, Length: 75, dtype: int64

In [6]:
data3 = data2

### Handle null value

In [7]:
data3['House direction'] = data3['House direction'].fillna("Unknown")
data3['Balcony direction'] = data3['Balcony direction'].fillna("Unknown")
data3['Legal status'] = data3['Legal status'].fillna("Unknown")
data3['Furniture state'] = data3['Furniture state'].fillna("Unknown")
data3['Frontage'] = data3['Frontage'].fillna(0)
data3['Access Road'] = data3['Access Road'].fillna(data3['Access Road'].median())
data3['Floors'] = data3['Floors'].fillna(data3['Floors'].median())
data3['Bedrooms'] = data3['Bedrooms'].fillna(data3['Bedrooms'].median())
data3['Bathrooms'] = data3['Bathrooms'].fillna(data3['Bathrooms'].median())
data3

Unnamed: 0,Address,Area,Frontage,Access Road,House direction,Balcony direction,Floors,Bedrooms,Bathrooms,Legal status,Furniture state,Price,District
0,"Dự án The Empire - Vinhomes Ocean Park 2, Xã L...",84.0,0.0,6.0,Unknown,Unknown,4.0,3.0,3.0,Have certificate,Unknown,8.60,Văn Giang
1,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",60.0,0.0,6.0,Unknown,Unknown,5.0,3.0,3.0,Unknown,Unknown,7.50,Văn Giang
2,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",90.0,6.0,13.0,Đông - Bắc,Đông - Bắc,5.0,3.0,3.0,Sale contract,Unknown,8.90,Văn Giang
3,"Đường Nguyễn Văn Khối, Phường 11, Gò Vấp, Hồ C...",54.0,0.0,3.5,Tây - Nam,Tây - Nam,2.0,2.0,3.0,Have certificate,Full,5.35,Gò Vấp
4,"Đường Quang Trung, Phường 8, Gò Vấp, Hồ Chí Minh",92.0,0.0,6.0,Đông - Nam,Đông - Nam,2.0,4.0,4.0,Have certificate,Full,6.90,Gò Vấp
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30224,"Đường Lê Quang Định, Phường 1, Gò Vấp, Hồ Chí ...",67.0,4.1,16.0,Unknown,Unknown,1.0,3.0,2.0,Have certificate,Unknown,4.60,Gò Vấp
30225,"Đường Ngô Gia Tự, Phường Đức Giang, Long Biên,...",30.0,0.0,6.0,Unknown,Unknown,5.0,3.0,3.0,Have certificate,Unknown,4.70,Long Biên
30226,"Đường Gò Dưa, Phường Tam Bình, Thủ Đức, Hồ Chí...",69.4,4.0,15.0,Đông - Bắc,Đông - Bắc,3.0,3.0,3.0,Have certificate,Basic,7.50,Thủ Đức
30227,"Đường Quang Trung, Phường 11, Gò Vấp, Hồ Chí Minh",96.0,0.0,8.0,Unknown,Unknown,4.0,3.0,3.0,Unknown,Unknown,9.50,Gò Vấp


### One hot encoding

In [8]:
data4 = pd.get_dummies(data3, columns=['House direction', 'Balcony direction', 'Legal status', 'District', 'Furniture state'], dtype=int, drop_first=1)
data4

Unnamed: 0,Address,Area,Frontage,Access Road,Floors,Bedrooms,Bathrooms,Price,House direction_Nam,House direction_Tây,...,District_Từ Sơn,District_Văn Giang,District_Vĩnh Cửu,District_Vũng Tàu,District_Đà Lạt,District_Đông Anh,District_Đống Đa,District_Đức Hòa,Furniture state_Full,Furniture state_Unknown
0,"Dự án The Empire - Vinhomes Ocean Park 2, Xã L...",84.0,0.0,6.0,4.0,3.0,3.0,8.60,0,0,...,0,1,0,0,0,0,0,0,0,1
1,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",60.0,0.0,6.0,5.0,3.0,3.0,7.50,0,0,...,0,1,0,0,0,0,0,0,0,1
2,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",90.0,6.0,13.0,5.0,3.0,3.0,8.90,0,0,...,0,1,0,0,0,0,0,0,0,1
3,"Đường Nguyễn Văn Khối, Phường 11, Gò Vấp, Hồ C...",54.0,0.0,3.5,2.0,2.0,3.0,5.35,0,0,...,0,0,0,0,0,0,0,0,1,0
4,"Đường Quang Trung, Phường 8, Gò Vấp, Hồ Chí Minh",92.0,0.0,6.0,2.0,4.0,4.0,6.90,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30224,"Đường Lê Quang Định, Phường 1, Gò Vấp, Hồ Chí ...",67.0,4.1,16.0,1.0,3.0,2.0,4.60,0,0,...,0,0,0,0,0,0,0,0,0,1
30225,"Đường Ngô Gia Tự, Phường Đức Giang, Long Biên,...",30.0,0.0,6.0,5.0,3.0,3.0,4.70,0,0,...,0,0,0,0,0,0,0,0,0,1
30226,"Đường Gò Dưa, Phường Tam Bình, Thủ Đức, Hồ Chí...",69.4,4.0,15.0,3.0,3.0,3.0,7.50,0,0,...,0,0,0,0,0,0,0,0,0,0
30227,"Đường Quang Trung, Phường 11, Gò Vấp, Hồ Chí Minh",96.0,0.0,8.0,4.0,3.0,3.0,9.50,0,0,...,0,0,0,0,0,0,0,0,0,1


### Remove outliner

Using quantile

In [9]:
data4.describe()

Unnamed: 0,Area,Frontage,Access Road,Floors,Bedrooms,Bathrooms,Price,House direction_Nam,House direction_Tây,House direction_Tây - Bắc,...,District_Từ Sơn,District_Văn Giang,District_Vĩnh Cửu,District_Vũng Tàu,District_Đà Lạt,District_Đông Anh,District_Đống Đa,District_Đức Hòa,Furniture state_Full,Furniture state_Unknown
count,30229.0,30229.0,30229.0,30229.0,30229.0,30229.0,30229.0,30229.0,30229.0,30229.0,...,30229.0,30229.0,30229.0,30229.0,30229.0,30229.0,30229.0,30229.0,30229.0,30229.0
mean,68.498741,3.310595,7.038359,3.361507,3.423765,3.265672,5.872078,0.03447,0.026167,0.037216,...,0.004301,0.011545,0.00172,0.004234,0.002283,0.003937,0.032518,0.002746,0.350359,0.467068
std,48.069835,4.29576,5.652009,1.254256,1.20752,1.234208,2.211877,0.182437,0.159634,0.189294,...,0.065438,0.106828,0.04144,0.064935,0.047723,0.06262,0.177375,0.052328,0.47709,0.498923
min,3.1,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,40.0,0.0,5.0,2.0,3.0,3.0,4.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,56.0,4.0,6.0,3.0,3.0,3.0,5.9,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,80.0,5.0,6.0,4.0,4.0,4.0,7.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,595.0,77.0,85.0,10.0,9.0,9.0,11.5,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
data5 = data4

In [11]:
max_threshold, min_threshold = data5.Area.quantile([0.999, 0.001])

data5 = data5[(data5.Area <= max_threshold) & (data5.Area >= min_threshold)]
data5.Area.describe()

count    30172.000000
mean        68.099566
std         45.785718
min         12.000000
25%         40.000000
50%         56.000000
75%         80.000000
max        500.000000
Name: Area, dtype: float64

In [12]:
max_threshold, min_threshold = data5.Frontage.quantile([0.999, 0.001])

data5 = data5[(data5.Frontage <= max_threshold) & (data5.Frontage >= min_threshold)]
data5.Frontage.describe()

count    30142.000000
mean         3.247695
std          3.879791
min          0.000000
25%          0.000000
50%          4.000000
75%          5.000000
max         51.000000
Name: Frontage, dtype: float64

In [13]:
max_threshold, min_threshold = data5['Access Road'].quantile([0.999, 0.001])

data5 = data5[(data5['Access Road'] <= max_threshold) & (data5['Access Road'] >= min_threshold)]
data5['Access Road'].describe()

count    30082.000000
mean         6.972330
std          5.247557
min          1.800000
25%          5.000000
50%          6.000000
75%          6.000000
max         61.000000
Name: Access Road, dtype: float64

In [14]:
max_threshold, min_threshold = data5['Floors'].quantile([0.999, 0.001])

data5 = data5[(data5['Floors'] <= max_threshold) & (data5['Floors'] >= min_threshold)]
data5['Floors'].describe()

count    30077.000000
mean         3.361771
std          1.251925
min          1.000000
25%          2.000000
50%          3.000000
75%          4.000000
max          7.000000
Name: Floors, dtype: float64

In [15]:
max_threshold, min_threshold = data5['Bedrooms'].quantile([0.999, 0.001])

data5 = data5[(data5['Bedrooms'] <= max_threshold) & (data5['Bedrooms'] >= min_threshold)]
data5['Bedrooms'].describe()

count    30077.000000
mean         3.425541
std          1.205736
min          1.000000
25%          3.000000
50%          3.000000
75%          4.000000
max          9.000000
Name: Bedrooms, dtype: float64

In [16]:
max_threshold, min_threshold = data5['Bathrooms'].quantile([0.999, 0.001])

data5 = data5[(data5['Bathrooms'] <= max_threshold) & (data5['Bathrooms'] >= min_threshold)]
data5['Bathrooms'].describe()

count    30077.000000
mean         3.267114
std          1.232026
min          1.000000
25%          3.000000
50%          3.000000
75%          4.000000
max          9.000000
Name: Bathrooms, dtype: float64

In [17]:
data5.describe()

Unnamed: 0,Area,Frontage,Access Road,Floors,Bedrooms,Bathrooms,Price,House direction_Nam,House direction_Tây,House direction_Tây - Bắc,...,District_Từ Sơn,District_Văn Giang,District_Vĩnh Cửu,District_Vũng Tàu,District_Đà Lạt,District_Đông Anh,District_Đống Đa,District_Đức Hòa,Furniture state_Full,Furniture state_Unknown
count,30077.0,30077.0,30077.0,30077.0,30077.0,30077.0,30077.0,30077.0,30077.0,30077.0,...,30077.0,30077.0,30077.0,30077.0,30077.0,30077.0,30077.0,30077.0,30077.0,30077.0
mean,68.063966,3.241246,6.972492,3.361771,3.425541,3.267114,5.876077,0.034245,0.026166,0.037038,...,0.004256,0.011604,0.001729,0.004256,0.002294,0.00389,0.03255,0.002726,0.350467,0.466802
std,45.744078,3.867897,5.247978,1.251925,1.205736,1.232026,2.208814,0.181862,0.159632,0.188859,...,0.065098,0.107095,0.041545,0.065098,0.047843,0.06225,0.177458,0.052144,0.477124,0.498905
min,12.0,0.0,1.8,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,40.0,0.0,5.0,2.0,3.0,3.0,4.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,56.0,4.0,6.0,3.0,3.0,3.0,5.9,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,80.0,5.0,6.0,4.0,4.0,4.0,7.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,500.0,51.0,61.0,7.0,9.0,9.0,11.5,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
data6 = data5

Using Z score

In [19]:
data6 = data6[( data6['Area'] <= data6['Area'].mean() + 3*data6['Area'].std() ) & 
    ( data6['Area'] >= data6['Area'].mean() - 3*data6['Area'].std() ) &
    
    (data6['Frontage'] <= data6['Frontage'].mean() + 3*data6['Frontage'].std() ) & 
    ( data6['Frontage'] >= data6['Frontage'].mean() - 3*data6['Frontage'].std() ) &

    (data6['Access Road'] <= data6['Access Road'].mean() + 3*data6['Access Road'].std() ) & 
    ( data6['Access Road'] >= data6['Access Road'].mean() - 3*data6['Access Road'].std() ) &

    (data6['Floors'] <= data6['Floors'].mean() + 3*data6['Floors'].std() ) & 
    ( data6['Floors'] >= data6['Floors'].mean() - 3*data6['Floors'].std() ) &

    (data6['Bedrooms'] <= data6['Bedrooms'].mean() + 3*data6['Bedrooms'].std() ) & 
    ( data6['Bedrooms'] >= data6['Bedrooms'].mean() - 3*data6['Bedrooms'].std() ) &

    (data6['Bathrooms'] <= data6['Bathrooms'].mean() + 3*data6['Bathrooms'].std() ) & 
    ( data6['Bathrooms'] >= data6['Bathrooms'].mean() - 3*data6['Bathrooms'].std() )]

Add Total_Usable_Area column for more accuracy in training model

In [20]:
data6 = data6.copy()
data6['Total_Usable_Area'] = data6['Area'] * data6['Floors']
data6

Unnamed: 0,Address,Area,Frontage,Access Road,Floors,Bedrooms,Bathrooms,Price,House direction_Nam,House direction_Tây,...,District_Văn Giang,District_Vĩnh Cửu,District_Vũng Tàu,District_Đà Lạt,District_Đông Anh,District_Đống Đa,District_Đức Hòa,Furniture state_Full,Furniture state_Unknown,Total_Usable_Area
0,"Dự án The Empire - Vinhomes Ocean Park 2, Xã L...",84.0,0.0,6.0,4.0,3.0,3.0,8.60,0,0,...,1,0,0,0,0,0,0,0,1,336.0
1,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",60.0,0.0,6.0,5.0,3.0,3.0,7.50,0,0,...,1,0,0,0,0,0,0,0,1,300.0
2,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",90.0,6.0,13.0,5.0,3.0,3.0,8.90,0,0,...,1,0,0,0,0,0,0,0,1,450.0
3,"Đường Nguyễn Văn Khối, Phường 11, Gò Vấp, Hồ C...",54.0,0.0,3.5,2.0,2.0,3.0,5.35,0,0,...,0,0,0,0,0,0,0,1,0,108.0
4,"Đường Quang Trung, Phường 8, Gò Vấp, Hồ Chí Minh",92.0,0.0,6.0,2.0,4.0,4.0,6.90,0,0,...,0,0,0,0,0,0,0,1,0,184.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30224,"Đường Lê Quang Định, Phường 1, Gò Vấp, Hồ Chí ...",67.0,4.1,16.0,1.0,3.0,2.0,4.60,0,0,...,0,0,0,0,0,0,0,0,1,67.0
30225,"Đường Ngô Gia Tự, Phường Đức Giang, Long Biên,...",30.0,0.0,6.0,5.0,3.0,3.0,4.70,0,0,...,0,0,0,0,0,0,0,0,1,150.0
30226,"Đường Gò Dưa, Phường Tam Bình, Thủ Đức, Hồ Chí...",69.4,4.0,15.0,3.0,3.0,3.0,7.50,0,0,...,0,0,0,0,0,0,0,0,0,208.2
30227,"Đường Quang Trung, Phường 11, Gò Vấp, Hồ Chí Minh",96.0,0.0,8.0,4.0,3.0,3.0,9.50,0,0,...,0,0,0,0,0,0,0,0,1,384.0


In [21]:
data6.head()

Unnamed: 0,Address,Area,Frontage,Access Road,Floors,Bedrooms,Bathrooms,Price,House direction_Nam,House direction_Tây,...,District_Văn Giang,District_Vĩnh Cửu,District_Vũng Tàu,District_Đà Lạt,District_Đông Anh,District_Đống Đa,District_Đức Hòa,Furniture state_Full,Furniture state_Unknown,Total_Usable_Area
0,"Dự án The Empire - Vinhomes Ocean Park 2, Xã L...",84.0,0.0,6.0,4.0,3.0,3.0,8.6,0,0,...,1,0,0,0,0,0,0,0,1,336.0
1,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",60.0,0.0,6.0,5.0,3.0,3.0,7.5,0,0,...,1,0,0,0,0,0,0,0,1,300.0
2,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",90.0,6.0,13.0,5.0,3.0,3.0,8.9,0,0,...,1,0,0,0,0,0,0,0,1,450.0
3,"Đường Nguyễn Văn Khối, Phường 11, Gò Vấp, Hồ C...",54.0,0.0,3.5,2.0,2.0,3.0,5.35,0,0,...,0,0,0,0,0,0,0,1,0,108.0
4,"Đường Quang Trung, Phường 8, Gò Vấp, Hồ Chí Minh",92.0,0.0,6.0,2.0,4.0,4.0,6.9,0,0,...,0,0,0,0,0,0,0,1,0,184.0


In [22]:
X = data6.drop(['Price','Address'], axis=1)
y = data6['Price']

### Hyperparameter Tuning

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit
import numpy as np

def score_model(X,y):
    model_params = {
        'LinearRegression': {
            'model': LinearRegression(),
            'params': {
            }
        },
        'Lasso': {
            'model': Pipeline([
                ('scaler', StandardScaler()),
                ('regressor', Lasso())]),
            'params': {
                'regressor__alpha': [0.01, 0.1, 1, 10, 50, 100]
            }
        },
        'Ridge': {
            'model': Pipeline([
                ('scaler', StandardScaler()),
                ('regressor', Ridge())]),
            'params': {
                'regressor__alpha': [0.01, 0.1, 1, 10, 50, 100]
            }
        },
        'RandomForest': {
            'model': RandomForestRegressor(n_jobs=-1),
            'params': {
                'n_estimators': [50, 200], # Số cây
                'max_depth': [10, 20, None], 
            }
        }
    }
    cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    score = []
    for name, param in model_params.items():
        clf = GridSearchCV(param['model'], param['params'], cv=cv)
        y_log = np.log1p(y) 
        clf.fit(X,y_log)
        score.append({
            'model': name,
            'best_score': clf.best_score_,
            'best_param': clf.best_params_
        })
    return pd.DataFrame(score, columns  = ['model', 'best_score', 'best_param'])
score_model(X,y)

Unnamed: 0,model,best_score,best_param
0,LinearRegression,0.549995,{}
1,Lasso,0.465045,{'regressor__alpha': 0.01}
2,Ridge,0.549998,{'regressor__alpha': 1}
3,RandomForest,0.617826,"{'max_depth': None, 'n_estimators': 200}"


In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)

y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test) 

model = RandomForestRegressor(
    n_estimators=200,    
    max_depth=None,         
    max_features='sqrt', 
    random_state=42,
    n_jobs=-1
)
model.fit(X_train,y_train_log)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",200
,"criterion  criterion: {""squared_error"", ""absolute_error"", ""friedman_mse"", ""poisson""}, default=""squared_error"" The function to measure the quality of a split. Supported criteria are ""squared_error"" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, ""friedman_mse"", which uses mean squared error with Friedman's improvement score for potential splits, ""absolute_error"" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and ""poisson"" which uses reduction in Poisson deviance to find splits. Training using ""absolute_error"" is significantly slower than when using ""squared_error"". .. versionadded:: 0.18  Mean Absolute Error (MAE) criterion. .. versionadded:: 1.0  Poisson criterion.",'squared_error'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=1.0 The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None or 1.0, then `max_features=n_features`. .. note::  The default of 1.0 is equivalent to bagged trees and more  randomness can be achieved by setting smaller values, e.g. 0.3. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to 1.0. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [25]:
print(model.score(X_train, y_train_log), model.score(X_test, y_test_log))

0.9421839038284008 0.6187592467886436


In [26]:
from sklearn.metrics import mean_absolute_percentage_error

y_pred_log = model.predict(X_test)
y_pred_real = np.expm1(y_pred_log) 

my_mape = mean_absolute_percentage_error(y_test, y_pred_real) * 100
print(f"MAPE: {my_mape:.2f}%")

MAPE: 21.41%


### Price prediction

In [27]:
X.columns

Index(['Area', 'Frontage', 'Access Road', 'Floors', 'Bedrooms', 'Bathrooms',
       'House direction_Nam', 'House direction_Tây',
       'House direction_Tây - Bắc', 'House direction_Tây - Nam',
       ...
       'District_Văn Giang', 'District_Vĩnh Cửu', 'District_Vũng Tàu',
       'District_Đà Lạt', 'District_Đông Anh', 'District_Đống Đa',
       'District_Đức Hòa', 'Furniture state_Full', 'Furniture state_Unknown',
       'Total_Usable_Area'],
      dtype='object', length=101)

In [28]:
loc_index = np.where(X.columns == 'District_Vĩnh Cửu')[0][0]
loc_index

np.int64(92)

In [29]:
np.zeros(len(X.columns))

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [30]:
def predict_price(area, frontage, access_road, floors, bedrooms, bathrooms, _direction, _balcony, _district):
    direction = "House direction_" + _direction
    district = "District_" + _district
    balcony = "Balcony direction_" + _balcony
    
    #district_index = np.where(X.columns == district)[0][0]
    #direction_index = np.where(X.columns == direction)[0][0]

    x_query = np.zeros(len(X.columns))

    x_query[0] = area
    x_query[1] = frontage
    x_query[2] = access_road
    x_query[3] = floors
    x_query[4] = bedrooms
    x_query[5] = bathrooms
    
    try:
        district_index = np.where(X.columns == district)[0][0]
        x_query[district_index] = 1
    except IndexError:
        print(f"Warning: '{district}' not found in training data. Ignoring.")

    try:
        direction_index = np.where(X.columns == direction)[0][0]
        x_query[direction_index] = 1
    except IndexError:
        print(f"Warning: '{direction}' not found in training data. Ignoring.")

    try:
        balcony_index = np.where(X.columns == balcony)[0][0]
        x_query[balcony_index] = 1
    except IndexError:
        print(f"Warning: '{balcony}' not found in training data. Ignoring.")
        
    x_df = pd.DataFrame([x_query], columns=X.columns)
    return model.predict(x_df)[0]

In [36]:
predict_price(5600, 5, 3, 2, 3, 5, 'Nam', 'Nam', 'Đà Lạt')

np.float64(1.620590801674132)

In [32]:
predict_price(5600, 5, 3, 7, 3, 5, 'Tây - Bắc', 'Nam', 'Đà Lạt')

np.float64(1.7692032430765787)

## Export model

In [33]:
import pickle
with open('vietnam_housing_price.pickle', 'wb') as f:
    pickle.dump(model, f)

In [38]:
import json

columns = {
    'data_columns': [col for col in X.columns]
}

with open('dataset_columns.json', 'w') as f:
    json.dump(columns, f)