In [63]:
import pandas as pd
import numpy as np
import re

In [64]:
df = pd.read_csv("rumah123_yogya_unfiltered.csv")
df.head()

Unnamed: 0,price,nav-link,description,listing-location,bed,bath,carport,surface_area,building_area
0,"Rp 1,79 Miliar",https://www.rumah123.com/properti/sleman/hos17...,Rumah 2 Lantai Baru di jalan Palagan Sleman Y...,"Ngaglik, Sleman",3.0,3.0,2.0,120 m²,110 m²
1,Rp 170 Juta,https://www.rumah123.com/properti/sleman/hos17...,RUMAH BARU DEKAT AL AZHAR DAN UGM,"Jombor, Sleman",3.0,2.0,1.0,102 m²,126 m²
2,Rp 695 Juta,https://www.rumah123.com/properti/sleman/hos17...,RUMAH ASRI DAN SEJUK DI BERBAH SLEMAN DEKAT PA...,"Berbah, Sleman",2.0,2.0,1.0,100 m²,100 m²
3,Rp 560 Juta,https://www.rumah123.com/properti/sleman/hos17...,Rumah Murah 5 Menit Dari Candi Prambanan Tersi...,"Prambanan, Sleman",3.0,1.0,1.0,109 m²,67 m²
4,Rp 200 Juta,https://www.rumah123.com/properti/sleman/hos17...,Rumah Murah Cicilan 1jt Di Moyudan Sleman,"Moyudan, Sleman",2.0,1.0,1.0,60 m²,30 m²


In [65]:
df.shape

(2020, 9)

# Cleaning data

In [66]:
df.isna().sum()

price                 0
nav-link              0
description           0
listing-location      0
bed                  19
bath                 21
carport             307
surface_area          1
building_area         1
dtype: int64

## Missing value handle

In [67]:
df = df.dropna(subset=['bed', 'bath', 'surface_area', 'building_area'])
df.isna().sum(), df.shape

(price                 0
 nav-link              0
 description           0
 listing-location      0
 bed                   0
 bath                  0
 carport             285
 surface_area          0
 building_area         0
 dtype: int64,
 (1998, 9))

In [68]:
df['carport'] = df['carport'].fillna(0)
df.isna().sum(), df.shape

(price               0
 nav-link            0
 description         0
 listing-location    0
 bed                 0
 bath                0
 carport             0
 surface_area        0
 building_area       0
 dtype: int64,
 (1998, 9))

Delete space

In [69]:
df['listing-location'] = [re.sub(r'\s+', ' ', i).strip() for i in df['listing-location']]

In [70]:
df['listing-location'].unique()

array(['Ngaglik, Sleman', 'Jombor, Sleman', 'Berbah, Sleman',
       'Prambanan, Sleman', 'Moyudan, Sleman', 'Depok, Sleman',
       'Gamping, Sleman', 'Kaliurang, Yogyakarta', 'Sedayu, Bantul',
       'Ngemplak, Sleman', 'Piyungan, Bantul', 'Umbulharjo, Yogyakarta',
       'Godean, Sleman', 'Mlati, Sleman', 'Condong Catur, Sleman',
       'Kasihan, Bantul', 'Bantul, Bantul', 'Sleman, Sleman',
       'Sewon, Bantul', 'Kalasan, Sleman', 'Plered, Bantul',
       'Sleman, Yogyakarta', 'Maguwoharjo, Yogyakarta',
       'Demangan, Yogyakarta', 'Purwomartani , Sleman', 'Minggir, Sleman',
       'Gondokusuman, Yogyakarta', 'Kotagede, Yogyakarta', 'Turi, Sleman',
       'Kaliurang, Sleman', 'Pogung, Yogyakarta',
       'Mantrijeron, Yogyakarta', 'Cebongan, Sleman',
       'Pakualaman, Yogyakarta', 'Bantul, Yogyakarta', 'Sayegan, Sleman',
       'Danurejan, Yogyakarta', 'Wirobrajan, Yogyakarta',
       'Banguntapan, Bantul', 'Seturan, Yogyakarta', 'Pakem, Sleman',
       'Caturtunggal, Sleman',

In [71]:
df = df[['price', 'listing-location', 'bed', 'bath', 'carport', 'surface_area', 'building_area']]
df.head()

Unnamed: 0,price,listing-location,bed,bath,carport,surface_area,building_area
0,"Rp 1,79 Miliar","Ngaglik, Sleman",3.0,3.0,2.0,120 m²,110 m²
1,Rp 170 Juta,"Jombor, Sleman",3.0,2.0,1.0,102 m²,126 m²
2,Rp 695 Juta,"Berbah, Sleman",2.0,2.0,1.0,100 m²,100 m²
3,Rp 560 Juta,"Prambanan, Sleman",3.0,1.0,1.0,109 m²,67 m²
4,Rp 200 Juta,"Moyudan, Sleman",2.0,1.0,1.0,60 m²,30 m²


In [72]:
df.dtypes

price                object
listing-location     object
bed                 float64
bath                float64
carport             float64
surface_area         object
building_area        object
dtype: object

## Price to numeric

In [73]:
def convert_price(price_str):
    price_str = price_str.replace("Rp ", "")

    if "Miliar" in price_str:
        number = float(re.sub("[^0-9,]", "", price_str).replace(",",".")) * 1_000_000_000
    elif "Juta" in price_str:
        number = float(re.sub("[^0-9,]", "", price_str).replace(",",".")) * 1_000_000
    else:
        number = float(re.sub("[^0-9,]", "", price_str).replace(",","."))

    return int(number)

In [74]:
df['price'] = df['price'].apply(convert_price)
df.head()

Unnamed: 0,price,listing-location,bed,bath,carport,surface_area,building_area
0,1790000000,"Ngaglik, Sleman",3.0,3.0,2.0,120 m²,110 m²
1,170000000,"Jombor, Sleman",3.0,2.0,1.0,102 m²,126 m²
2,695000000,"Berbah, Sleman",2.0,2.0,1.0,100 m²,100 m²
3,560000000,"Prambanan, Sleman",3.0,1.0,1.0,109 m²,67 m²
4,200000000,"Moyudan, Sleman",2.0,1.0,1.0,60 m²,30 m²


In [75]:
df.dtypes

price                 int64
listing-location     object
bed                 float64
bath                float64
carport             float64
surface_area         object
building_area        object
dtype: object

## Convert bed, bath, and carport to int

In [76]:
def convert_to_int(data):
    data = data.astype(int)
    return data

In [77]:
df[['bed', 'bath', 'carport']] = df[['bed', 'bath', 'carport']].apply(convert_to_int)
df.head()

Unnamed: 0,price,listing-location,bed,bath,carport,surface_area,building_area
0,1790000000,"Ngaglik, Sleman",3,3,2,120 m²,110 m²
1,170000000,"Jombor, Sleman",3,2,1,102 m²,126 m²
2,695000000,"Berbah, Sleman",2,2,1,100 m²,100 m²
3,560000000,"Prambanan, Sleman",3,1,1,109 m²,67 m²
4,200000000,"Moyudan, Sleman",2,1,1,60 m²,30 m²


In [78]:
def convert_area(area_str):
    return int(area_str.replace("m²", "").strip())

In [79]:
df['surface_area'] = df['surface_area'].apply(convert_area)
df['building_area'] = df['building_area'].apply(convert_area)
df.head(), df.dtypes

(        price   listing-location  bed  bath  carport  surface_area  \
 0  1790000000    Ngaglik, Sleman    3     3        2           120   
 1   170000000     Jombor, Sleman    3     2        1           102   
 2   695000000     Berbah, Sleman    2     2        1           100   
 3   560000000  Prambanan, Sleman    3     1        1           109   
 4   200000000    Moyudan, Sleman    2     1        1            60   
 
    building_area  
 0            110  
 1            126  
 2            100  
 3             67  
 4             30  ,
 price                int64
 listing-location    object
 bed                  int64
 bath                 int64
 carport              int64
 surface_area         int64
 building_area        int64
 dtype: object)

## encoding listing-location

In [81]:
df = pd.get_dummies(df, columns=['listing-location'], drop_first=True)
df.head()

Unnamed: 0,price,bed,bath,carport,surface_area,building_area,"listing-location_Banguntapan, Bantul","listing-location_Bantul, Bantul","listing-location_Bantul, Yogyakarta","listing-location_Berbah, Sleman",...,"listing-location_Sidoarum , Sleman","listing-location_Sleman, Sleman","listing-location_Sleman, Yogyakarta","listing-location_Tegalrejo, Yogyakarta","listing-location_Tempel, Sleman","listing-location_Turi, Sleman","listing-location_Umbulharjo, Yogyakarta","listing-location_Wates, Kulon Progo","listing-location_Wirobrajan, Yogyakarta","listing-location_Wonosari, Gunung Kidul"
0,1790000000,3,3,2,120,110,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,170000000,3,2,1,102,126,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,695000000,2,2,1,100,100,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,560000000,3,1,1,109,67,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,200000000,2,1,1,60,30,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Modeling

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [85]:
X = df.drop('price', axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((1598, 73), (400, 73))

In [86]:
model = LinearRegression()
model.fit(X_train, y_train)

In [87]:
y_pred = model.predict(X_test)

In [88]:
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 3.9339844460891295e+18


In [90]:
import joblib

joblib.dump(model, "model.pkl")

['model.pkl']

In [91]:
new_data = {
    'listing-location': ["Kaliurang, Sleman"],  # Input untuk fitur yang di-OHE
    'bed': [2],
    'bath': [1],
    'carport': [1],
    'surface_area': [100],
    'building_area': [90]
}

# Membuat DataFrame dari input baru
new_df = pd.DataFrame(new_data)

new_df = pd.get_dummies(new_df, columns=['listing-location'])

In [100]:
cols_train = df.columns.tolist()
cols_train.remove('price')
new_df = new_df.reindex(columns=cols_train, fill_value=0)
new_df.head()

Unnamed: 0,bed,bath,carport,surface_area,building_area,"listing-location_Banguntapan, Bantul","listing-location_Bantul, Bantul","listing-location_Bantul, Yogyakarta","listing-location_Berbah, Sleman","listing-location_Caturtunggal, Sleman",...,"listing-location_Sidoarum , Sleman","listing-location_Sleman, Sleman","listing-location_Sleman, Yogyakarta","listing-location_Tegalrejo, Yogyakarta","listing-location_Tempel, Sleman","listing-location_Turi, Sleman","listing-location_Umbulharjo, Yogyakarta","listing-location_Wates, Kulon Progo","listing-location_Wirobrajan, Yogyakarta","listing-location_Wonosari, Gunung Kidul"
0,2,1,1,100,90,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
predicted_price = model.predict([new_df.iloc[0]])
print("Predicted Price:", predicted_price)

Predicted Price: [168178906.5300691]




In [103]:
%pip install Flask-SQLAlchemy

Collecting Flask-SQLAlchemy
  Downloading flask_sqlalchemy-3.1.1-py3-none-any.whl.metadata (3.4 kB)
Collecting sqlalchemy>=2.0.16 (from Flask-SQLAlchemy)
  Downloading SQLAlchemy-2.0.30-cp310-cp310-macosx_11_0_arm64.whl.metadata (9.6 kB)
Downloading flask_sqlalchemy-3.1.1-py3-none-any.whl (25 kB)
Downloading SQLAlchemy-2.0.30-cp310-cp310-macosx_11_0_arm64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m178.9 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: sqlalchemy, Flask-SQLAlchemy
Successfully installed Flask-SQLAlchemy-3.1.1 sqlalchemy-2.0.30
Note: you may need to restart the kernel to use updated packages.


In [104]:
%pip install --force-reinstall watchdog

Collecting watchdog
  Downloading watchdog-4.0.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (37 kB)
Downloading watchdog-4.0.1-cp310-cp310-macosx_11_0_arm64.whl (92 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.0/93.0 kB[0m [31m233.6 kB/s[0m eta [36m0:00:00[0m00:01[0m:01[0m
[?25hInstalling collected packages: watchdog
Successfully installed watchdog-4.0.1
Note: you may need to restart the kernel to use updated packages.
