In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import src.util as utils

In [3]:
# Load the project parameters into `konfig`. The file location is built by
# concatenating str(utils.get_dir()) with utils.get_params() -- presumably a
# base directory plus a relative config path; confirm in src/util.
konfig = utils.load_params(str(utils.get_dir()) + utils.get_params())

In [4]:
# Show the loaded parameters (file paths, column lists, allowed value ranges).
konfig

{'dir_dataset': 'data/raw/',
 'dataset_cleaned_path': 'data/processed/raw_dataset.pkl',
 'train_set_path': ['data/processed/x_train.pkl',
  'data/processed/y_train.pkl'],
 'test_set_path': ['data/processed/x_test.pkl', 'data/processed/y_test.pkl'],
 'train_clean_set_path': ['data/processed/x_train_clean.pkl',
  'data/processed/y_train_clean.pkl'],
 'production_model_path': 'models/production_model.pkl',
 'ohe_stasiun_path': 'models/ohe_stasiun.pkl',
 'le_encoder_path': 'models/le_encoder.pkl',
 'training_log_path': 'log/training_log.json',
 'kolom_int': ['HARGA', 'LB', 'LT', 'KT', 'KM', 'GRS'],
 'blok_LB': [150, 5],
 'blok_LT': [200, 5],
 'rentang_harga': [300000000, 70000000000],
 'rentang_LB': [30, 2000],
 'rentang_LT': [20, 2000],
 'rentang_KT': [1, 15],
 'rentang_KM': [1, 15],
 'rentang_GRS': [0, 15],
 'prediktor': ['LB', 'LT', 'KT', 'KM', 'GRS'],
 'label': ['HARGA']}

In [7]:
# Build the raw-data directory: str(utils.get_dir()) followed by the configured
# "dir_dataset" value after it has been passed through utils.cek_path_os.
# NOTE(review): cek_path_os presumably rewrites path separators for the current
# OS -- confirm in src/util.
dir_data_raw = str(utils.get_dir()) + utils.cek_path_os(konfig["dir_dataset"])
# NOTE(review): echoing this value stores an absolute local path in the saved
# notebook output; consider clearing that output before sharing the notebook.
dir_data_raw

'C:\\Users\\ilham.faisal\\OneDrive - PT. Indonesia Comnets Plus\\Belajar\\Proyek Data Science\\Harga Rumah Tebet\\data\\raw\\'

In [8]:
# Read the raw listings workbook. The file name is appended by plain string
# concatenation, so dir_data_raw must already end with a path separator.
data_rumah = pd.read_excel(dir_data_raw + "DATA RUMAH.xlsx")

In [9]:
# Preview the loaded listings.
# NOTE(review): data_rumah.head() would keep the stored output smaller.
data_rumah

Unnamed: 0,NO,NAMA RUMAH,HARGA,LB,LT,KT,KM,GRS
0,1,"Rumah Murah Hook Tebet Timur, Tebet, Jakarta S...",3800000000,220,220,3,3,0
1,2,"Rumah Modern di Tebet dekat Stasiun, Tebet, Ja...",4600000000,180,137,4,3,2
2,3,"Rumah Mewah 2 Lantai Hanya 3 Menit Ke Tebet, T...",3000000000,267,250,4,4,4
3,4,"Rumah Baru Tebet, Tebet, Jakarta Selatan",430000000,40,25,2,2,0
4,5,"Rumah Bagus Tebet komp Gudang Peluru lt 350m, ...",9000000000,400,355,6,5,3
...,...,...,...,...,...,...,...,...
1005,1006,Rumah Strategis Akses Jalan 2mobil Di Menteng ...,9000000000,450,550,10,10,3
1006,1007,Tebet Rumah Siap Huni Jln 2 Mbl Nyaman,4000000000,160,140,4,3,2
1007,1008,"Di Kebun Baru Rumah Terawat, Area Strategis",4000000000,139,230,4,4,1
1008,1009,Dijual Cepat Rumah Komp Depkeu Dr Soepomo Tebe...,19000000000,360,606,7,4,0


In [9]:
# Row and column counts of the raw dataset.
data_rumah.shape

(1010, 8)

In [10]:
# Count missing values per column.
data_rumah.isnull().sum()

NO            0
NAMA RUMAH    0
HARGA         0
LB            0
LT            0
KT            0
KM            0
GRS           0
dtype: int64

In [11]:
# Inspect column dtypes; the cek_data check in this notebook depends on which
# columns are integer-typed and on their order.
data_rumah.dtypes

NO             int64
NAMA RUMAH    object
HARGA          int64
LB             int64
LT             int64
KT             int64
KM             int64
GRS            int64
dtype: object

In [12]:
# Summary statistics of the numeric columns, useful for comparing the observed
# min/max against the configured "rentang_*" bounds.
data_rumah.describe()

Unnamed: 0,NO,HARGA,LB,LT,KT,KM,GRS
count,1010.0,1010.0,1010.0,1010.0,1010.0,1010.0,1010.0
mean,505.5,7628987000.0,276.539604,237.432673,4.668317,3.607921,1.920792
std,291.706188,7340946000.0,177.864557,179.957604,1.572776,1.420066,1.510998
min,1.0,430000000.0,40.0,25.0,2.0,1.0,0.0
25%,253.25,3262500000.0,150.0,130.0,4.0,3.0,1.0
50%,505.5,5000000000.0,216.5,165.0,4.0,3.0,2.0
75%,757.75,9000000000.0,350.0,290.0,5.0,4.0,2.0
max,1010.0,65000000000.0,1126.0,1400.0,10.0,10.0,10.0


In [5]:
def cek_data(dataset, konfig):
    """Validate column dtypes and value ranges of ``dataset`` against ``konfig``.

    Two kinds of checks are performed, in this order:

    1. The integer-typed columns of ``dataset``, with the first one skipped
       (in this notebook's data that is the ``NO`` column), must equal
       ``konfig["kolom_int"]`` exactly -- same names, same order.
    2. Each of the first six entries of ``konfig["kolom_int"]`` is paired by
       position with a range key of ``konfig``; every value of that column
       must lie within the inclusive ``[low, high]`` bounds stored there.

    Args:
        dataset: pandas DataFrame to validate.
        konfig: mapping that provides ``"kolom_int"`` and the ``"rentang_*"``
            two-element bounds.

    Returns:
        None. The function only asserts.

    Raises:
        AssertionError: on the first failed check, with a message naming it.
    """
    # Type check: drop the first integer column before comparing with the config.
    assert (
        dataset.select_dtypes("int").columns.to_list()[1:] == konfig["kolom_int"]
    ), "eror terjadi pada kolom int"

    # Range checks: position i in this table lines up with konfig["kolom_int"][i].
    range_checks = (
        ("rentang_harga", "eror pada rentang harga"),
        ("rentang_LB", "eror pada rentang Luas Bangunan"),
        ("rentang_LT", "eror pada rentang Luas Tanah"),
        ("rentang_KT", "eror pada rentang Jumlah Kamar Tidur"),
        ("rentang_KM", "eror pada rentang Jumlah Kamar Mandi"),
        ("rentang_GRS", "eror pada rentang Garasi"),
    )
    for position, (range_key, message) in enumerate(range_checks):
        # Config lookups stay inside the assert so they happen lazily, one
        # check at a time, and only when assertions are enabled.
        assert (
            dataset[konfig["kolom_int"][position]]
            .between(konfig[range_key][0], konfig[range_key][1])
            .all()
        ), message
                                                      

In [10]:
# Validate dtypes and value ranges of the raw data against the configuration;
# raises AssertionError on the first failed check and is silent otherwise.
cek_data(data_rumah, konfig)

In [28]:
# Separate predictors and label using the column lists from the configuration.
# Both config values are lists, so `y` is a one-column DataFrame, not a Series.
# .copy() keeps the selections independent of data_rumah.
x = data_rumah[konfig["prediktor"]].copy()
y = data_rumah[konfig["label"]].copy()

In [16]:
# Preview the predictor columns.
x

Unnamed: 0,LB,LT,KT,KM,GRS
0,220,220,3,3,0
1,180,137,4,3,2
2,267,250,4,4,4
3,40,25,2,2,0
4,400,355,6,5,3
...,...,...,...,...,...
1005,450,550,10,10,3
1006,160,140,4,3,2
1007,139,230,4,4,1
1008,360,606,7,4,0


In [29]:
# Preview the label column.
y

Unnamed: 0,HARGA
0,3800000000
1,4600000000
2,3000000000
3,430000000
4,9000000000
...,...
1005,9000000000
1006,4000000000
1007,4000000000
1008,19000000000


In [30]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 10)

In [19]:
# Report the size of each partition, one "<name> <row count>" line per
# partition; features and labels of the same split must have equal lengths.
print(
    *(
        f"{label} {len(part)}"
        for label, part in (
            ("X_train", X_train),
            ("X_test", X_test),
            ("y_train", y_train),
            ("y_test", y_test),
        )
    ),
    sep="\n",
)

X_train 707
X_test 303
y_train 707
y_test 303


In [31]:
# Count missing values per predictor column in the training set.
X_train.isnull().sum()

LB     0
LT     0
KT     0
KM     0
GRS    0
dtype: int64

In [32]:
# Preview the training labels; the index shows the shuffled original row ids.
y_train

Unnamed: 0,HARGA
846,2600000000
35,4000000000
728,16000000000
811,12500000000
293,2999000000
...,...
320,3750000000
527,8800000000
996,3800000000
125,4000000000


In [34]:
# Persist the full, validated dataset under the configured "dataset_cleaned_path".
# NOTE(review): utils.pickle_dump presumably pickles the object to the given
# path; how a relative path is resolved is not visible here -- confirm in src/util.
utils.pickle_dump(data_rumah, konfig["dataset_cleaned_path"])

In [39]:
# Persist the train partition: index 0 of the configured path pair receives the
# features, index 1 the labels.
utils.pickle_dump(X_train, konfig["train_set_path"][0])
utils.pickle_dump(y_train, konfig["train_set_path"][1])

# Persist the test partition using the same (features, labels) ordering.
utils.pickle_dump(X_test, konfig["test_set_path"][0])
utils.pickle_dump(y_test, konfig["test_set_path"][1])
