# Import

In [31]:
%pip install tensorflow

import json
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import callbacks, layers, models

Note: you may need to restart the kernel to use updated packages.


# Read CSV

In [32]:
# Location of the house-price dataset, relative to the notebook's working directory
file_path = 'house_price_jabodetabek_dataset.csv'

# Load the CSV into the DataFrame that the following cells operate on
df = pd.read_csv(filepath_or_buffer=file_path)

# Last expression of the cell: rendered by the notebook as a table preview
df

Unnamed: 0,District,City,Latitude,Longitude,Price,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage
0,Babelan,Bekasi,-6.227721,107.001649,3.450000e+08,60.0,40.0,1.0,2.0,1.0,1.0
1,Pondokmelati,Bekasi,-6.296441,106.921566,1.265000e+09,90.0,110.0,2.0,3.0,3.0,2.0
2,Bekasi Utara,Bekasi,-6.211765,107.003851,1.300000e+09,80.0,120.0,2.0,3.0,3.0,2.0
3,Cikarang Pusat,Bekasi,-6.365165,107.182391,1.090000e+09,98.0,102.0,2.0,4.0,2.0,2.0
4,Mustikajaya,Bekasi,-6.303947,107.026896,8.000000e+08,96.0,75.0,1.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
10198,Pantai Mutiara,Jakarta Utara,-6.107730,106.791458,1.850000e+10,583.0,1300.0,1.0,5.0,6.0,1.0
10199,Kemang,Jakarta Selatan,-6.277783,106.811422,5.400000e+09,156.0,262.0,4.0,4.0,3.0,2.0
10200,Daan Mogot,Jakarta Barat,-6.170157,106.762833,1.700000e+09,90.0,120.0,2.0,2.0,2.0,1.0
10201,Semanan,Jakarta Barat,-6.164927,106.705773,2.800000e+09,120.0,136.0,2.0,3.0,3.0,2.0


In [33]:
# Print each column's non-null count and dtype, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10203 entries, 0 to 10202
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   District          10203 non-null  object 
 1   City              10203 non-null  object 
 2   Latitude          10203 non-null  float64
 3   Longitude         10203 non-null  float64
 4   Price             10203 non-null  float64
 5   Land_Size_M2      10203 non-null  float64
 6   Building_Size_M2  10203 non-null  float64
 7   Floors            10203 non-null  float64
 8   Bedrooms          10203 non-null  float64
 9   Bathrooms         10203 non-null  float64
 10  Carport/Garage    10203 non-null  float64
dtypes: float64(9), object(2)
memory usage: 876.9+ KB


# Encoding

## Encoding District

Each district is encoded by the rank of its median house price: rank 1 is the district with the highest median price.

In [34]:
# Ordinal-encode "District" by the rank of each district's median "Price"
# (rank 1 = highest median) and export the mapping to JSON.
#
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split further down, so the encoding carries target information
# from rows that later end up in the test set. Confirm this is acceptable.

# Guard against re-running the cell: once "District" holds the integer codes,
# a second run would rank the codes themselves and overwrite the exported
# mapping with a meaningless one.
if pd.api.types.is_numeric_dtype(df['District']):
    raise RuntimeError(
        "'District' is already encoded; re-run the 'Read CSV' cell before encoding again."
    )

# Step 1: Median "Price" for each district
district_medians = df.groupby('District')['Price'].median()

# Step 2: Districts ordered from highest to lowest median "Price"
sorted_categories_district = district_medians.sort_values(ascending=False).index

# Step 3: Map each district to its 1-based rank in that order
encoding_district_dict = {district: idx for idx, district in enumerate(sorted_categories_district, start=1)}

# Print the encoding dictionary
print("Encoding Dictionary:", encoding_district_dict)

# Create the "encoding" folder; exist_ok avoids the check-then-create race
encoding_folder = 'encoding'
os.makedirs(encoding_folder, exist_ok=True)

# Export the encoding to a JSON file in the "encoding" folder
json_file_path = os.path.join(encoding_folder, 'encoding_district_dict.json')
with open(json_file_path, 'w') as json_file:
    json.dump(encoding_district_dict, json_file, indent=4)

print(f"Dictionary has been exported to {json_file_path}")

# Step 4: Replace the district names with their integer codes
df['District'] = df['District'].map(encoding_district_dict)

df

Encoding Dictionary: {'Menteng Atas': 1, 'Hasyim Ashari': 2, 'Mega Kuningan': 3, 'Menteng': 4, 'Senayan': 5, 'Scbd': 6, 'Blok M': 7, 'Simprug': 8, 'Permata Hijau': 9, 'Prapanca': 10, 'Slipi': 11, 'Senopati': 12, 'Menteng Dalam': 13, 'Kuningan': 14, 'Patra Kuningan': 15, 'Pakubuwono': 16, 'Sudirman': 17, 'Kebayoran Baru': 18, 'Bangka': 19, 'Pejompongan': 20, 'Pantai Mutiara': 21, 'Ancol': 22, 'Kebon Kacang': 23, 'Lenteng Agung': 24, 'Pondok Indah': 25, 'Bendungan Hilir': 26, 'Setia Budi': 27, 'Setiabudi': 28, 'Fatmawati': 29, 'Ragunan': 30, 'Kedoya': 31, 'Cipulir': 32, 'Cipete': 33, 'Guntur': 34, 'Intercon': 35, 'Ciganjur': 36, 'Mampang Prapatan': 37, 'Pulomas': 38, 'Pantai Indah Kapuk': 39, 'Ampera': 40, 'Tanjung Duren Utara': 41, 'Patal Senayan': 42, 'Panglima Polim': 43, 'Pluit': 44, 'Kebayoran Lama': 45, 'Tawakal': 46, 'Lebak Bulus': 47, 'Kedoya Baru': 48, 'Gajah Mada': 49, 'Tanah Abang': 50, 'Harmoni': 51, 'Pondok Hijau Golf': 52, 'Tanjung Duren Selatan': 53, 'Veteran': 54, 'Angke'

Unnamed: 0,District,City,Latitude,Longitude,Price,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage
0,309,Bekasi,-6.227721,107.001649,3.450000e+08,60.0,40.0,1.0,2.0,1.0,1.0
1,213,Bekasi,-6.296441,106.921566,1.265000e+09,90.0,110.0,2.0,3.0,3.0,2.0
2,278,Bekasi,-6.211765,107.003851,1.300000e+09,80.0,120.0,2.0,3.0,3.0,2.0
3,274,Bekasi,-6.365165,107.182391,1.090000e+09,98.0,102.0,2.0,4.0,2.0,2.0
4,283,Bekasi,-6.303947,107.026896,8.000000e+08,96.0,75.0,1.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
10198,21,Jakarta Utara,-6.107730,106.791458,1.850000e+10,583.0,1300.0,1.0,5.0,6.0,1.0
10199,58,Jakarta Selatan,-6.277783,106.811422,5.400000e+09,156.0,262.0,4.0,4.0,3.0,2.0
10200,171,Jakarta Barat,-6.170157,106.762833,1.700000e+09,90.0,120.0,2.0,2.0,2.0,1.0
10201,173,Jakarta Barat,-6.164927,106.705773,2.800000e+09,120.0,136.0,2.0,3.0,3.0,2.0


## Encoding City

Each city is encoded by the rank of its median house price: rank 1 is the city with the highest median price.

In [35]:
# Ordinal-encode "City" by the rank of each city's median "Price"
# (rank 1 = highest median) and export the mapping to JSON.
#
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split further down, so the encoding carries target information
# from rows that later end up in the test set. Confirm this is acceptable.

# Guard against re-running the cell: once "City" holds the integer codes,
# a second run would rank the codes themselves and overwrite the exported
# mapping with a meaningless one.
if pd.api.types.is_numeric_dtype(df['City']):
    raise RuntimeError(
        "'City' is already encoded; re-run the 'Read CSV' cell before encoding again."
    )

# Step 1: Median "Price" for each city
city_medians = df.groupby('City')['Price'].median()

# Step 2: Cities ordered from highest to lowest median "Price"
sorted_categories_city = city_medians.sort_values(ascending=False).index

# Step 3: Map each city to its 1-based rank in that order
encoding_city_dict = {city: idx for idx, city in enumerate(sorted_categories_city, start=1)}

# Print the encoding dictionary
print("Encoding Dictionary:", encoding_city_dict)

# Make sure the export folder (named by the district-encoding cell) exists
os.makedirs(encoding_folder, exist_ok=True)

# Export the encoding to a JSON file in the "encoding" folder
json_file_path = os.path.join(encoding_folder, 'encoding_city_dict.json')
with open(json_file_path, 'w') as json_file:
    json.dump(encoding_city_dict, json_file, indent=4)

print(f"Dictionary has been exported to {json_file_path}")

# Step 4: Replace the city names with their integer codes
df['City'] = df['City'].map(encoding_city_dict)

df

Encoding Dictionary: {'Jakarta Selatan': 1, 'Jakarta Utara': 2, 'Jakarta Pusat': 3, 'Jakarta Barat': 4, 'Jakarta Timur': 5, 'Tangerang': 6, 'Depok': 7, 'Bogor': 8, 'Bekasi': 9}
Dictionary has been exported to encoding\encoding_city_dict.json


Unnamed: 0,District,City,Latitude,Longitude,Price,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage
0,309,9,-6.227721,107.001649,3.450000e+08,60.0,40.0,1.0,2.0,1.0,1.0
1,213,9,-6.296441,106.921566,1.265000e+09,90.0,110.0,2.0,3.0,3.0,2.0
2,278,9,-6.211765,107.003851,1.300000e+09,80.0,120.0,2.0,3.0,3.0,2.0
3,274,9,-6.365165,107.182391,1.090000e+09,98.0,102.0,2.0,4.0,2.0,2.0
4,283,9,-6.303947,107.026896,8.000000e+08,96.0,75.0,1.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
10198,21,2,-6.107730,106.791458,1.850000e+10,583.0,1300.0,1.0,5.0,6.0,1.0
10199,58,1,-6.277783,106.811422,5.400000e+09,156.0,262.0,4.0,4.0,3.0,2.0
10200,171,4,-6.170157,106.762833,1.700000e+09,90.0,120.0,2.0,2.0,2.0,1.0
10201,173,4,-6.164927,106.705773,2.800000e+09,120.0,136.0,2.0,3.0,3.0,2.0


# Normalization

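Every feature column is scaled to the range $[0, 1]$ with min-max normalization, $x' = \dfrac{x - x_{min}}{x_{max} - x_{min}}$. The target column `Price` is not normalized here.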

In [36]:
# Export the per-column minimum and maximum of every feature (all columns
# except the target "Price") to JSON, under "<column>_max" / "<column>_min".

# Compute maximum and minimum values for each column
max_values = df.max()
min_values = df.min()

# Build the dictionary of bounds. Values are cast to native Python floats:
# json.dump cannot serialise NumPy integer scalars, so without the cast the
# export only works when pandas happens to upcast the reduced values to float64.
result_dict = {}
for column in df.columns:
    if column == "Price":
        # The target is not min-max normalised, so its bounds are not exported
        continue
    result_dict[f"{column}_max"] = float(max_values[column])
    result_dict[f"{column}_min"] = float(min_values[column])

# Specify the file path where the JSON is saved
json_file_path = os.path.join(encoding_folder, 'max_min_values.json')

# Write the dictionary to a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(result_dict, json_file, indent=4)

print(f"Dictionary has been exported to {json_file_path}")

Dictionary has been exported to encoding\max_min_values.json


In [37]:
def min_max_normalize(column):
    """Scale a numeric pandas Series to the range [0, 1] with min-max scaling.

    Each value ``x`` becomes ``(x - min) / (max - min)``.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to scale.

    Returns
    -------
    pandas.Series
        The scaled values, on the same index as ``column``. A constant column
        (``max == min``) is returned as all zeros rather than the NaNs that
        the 0/0 division would produce.
    """
    min_val = column.min()
    value_range = column.max() - min_val
    if value_range == 0:
        # Constant column: avoid 0/0 -> NaN, which would poison the feature
        return (column - min_val).astype(float)
    return (column - min_val) / value_range

# Min-max scale every feature column; the target "Price" is excluded
normalized_columns = df.loc[:, df.columns != 'Price'].apply(min_max_normalize)

# Re-attach the untouched "Price" column as the last column
df = normalized_columns.assign(Price=df['Price'])

df

Unnamed: 0,District,City,Latitude,Longitude,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage,Price
0,0.903226,1.000,0.157189,0.994624,0.004161,0.004279,0.000000,0.018182,0.000000,0.066667,3.450000e+08
1,0.621701,1.000,0.147901,0.993879,0.007283,0.020045,0.076923,0.036364,0.036364,0.133333,1.265000e+09
2,0.812317,1.000,0.159346,0.994644,0.006242,0.022297,0.076923,0.036364,0.036364,0.133333,1.300000e+09
3,0.800587,1.000,0.138611,0.996304,0.008115,0.018243,0.076923,0.054545,0.018182,0.133333,1.090000e+09
4,0.826979,1.000,0.146886,0.994858,0.007907,0.012162,0.000000,0.036364,0.000000,0.066667,8.000000e+08
...,...,...,...,...,...,...,...,...,...,...,...
10198,0.058651,0.125,0.173409,0.992670,0.058573,0.288063,0.000000,0.072727,0.090909,0.066667,1.850000e+10
10199,0.167155,0.000,0.150422,0.992855,0.014149,0.054279,0.230769,0.054545,0.036364,0.133333,5.400000e+09
10200,0.498534,0.375,0.164970,0.992403,0.007283,0.022297,0.076923,0.018182,0.018182,0.066667,1.700000e+09
10201,0.504399,0.375,0.165677,0.991873,0.010404,0.025901,0.076923,0.036364,0.036364,0.133333,2.800000e+09


# Data Split

In [38]:
# Split 80/20 separately inside each city so that the train and test sets
# keep the same mix of cities as the full dataset.

# Collect the per-city pieces in lists and concatenate once at the end:
# growing a DataFrame with pd.concat inside the loop copies it on every
# iteration and starts from an empty frame, which pandas has deprecated.
train_parts = []
test_parts = []

# Iterate over each unique city
for city in df['City'].unique():
    # Filter the DataFrame for the current city
    city_df = df[df['City'] == city]

    # Perform the train-test split for the current city
    city_train, city_test = train_test_split(city_df, test_size=0.2, random_state=42)

    train_parts.append(city_train)
    test_parts.append(city_test)

train_df = pd.concat(train_parts)
test_df = pd.concat(test_parts)

# Every row must land in exactly one of the two sets
assert len(train_df) + len(test_df) == len(df)

# Report the number of rows per city in the full, train and test sets
for set_name, frame in (("All", df), ("Train", train_df), ("Test", test_df)):
    unique_values_counts = frame['City'].value_counts()
    print(f"\n{set_name} set unique value counts\n", unique_values_counts)


All set unique value counts
 City
1.000    1841
0.875    1700
0.750    1357
0.000    1240
0.625    1193
0.375     995
0.125     928
0.500     697
0.250     252
Name: count, dtype: int64

Train set unique value counts
 City
1.000    1472
0.875    1360
0.750    1085
0.000     992
0.625     954
0.375     796
0.125     742
0.500     557
0.250     201
Name: count, dtype: int64

Test set unique value counts
 City
1.000    369
0.875    340
0.750    272
0.000    248
0.625    239
0.375    199
0.125    186
0.500    140
0.250     51
Name: count, dtype: int64


In [39]:
# Preview the training split (features are min-max scaled, "Price" is unscaled)
train_df

Unnamed: 0,District,City,Latitude,Longitude,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage,Price
588,0.759531,1.000,0.145301,0.994624,0.008115,0.039414,0.076923,0.036364,0.018182,0.133333,2.000000e+09
949,0.741935,1.000,0.163480,0.994440,0.009571,0.016667,0.000000,0.036364,0.018182,0.066667,1.050000e+09
1631,0.812317,1.000,0.157189,0.994624,0.016646,0.035811,0.076923,0.054545,0.054545,0.133333,5.500000e+09
1719,0.806452,1.000,0.145776,0.994804,0.005202,0.007658,0.000000,0.018182,0.000000,0.066667,5.650000e+08
1338,0.741935,1.000,0.163480,0.994440,0.005618,0.020045,0.076923,0.054545,0.036364,0.066667,1.050000e+09
...,...,...,...,...,...,...,...,...,...,...,...
6748,0.472141,0.375,0.169174,0.992103,0.006866,0.017793,0.076923,0.018182,0.018182,0.000000,1.775000e+09
7525,0.472141,0.375,0.169476,0.992079,0.002601,0.015541,0.076923,0.018182,0.018182,0.000000,1.300000e+09
9596,0.158358,0.375,0.168310,0.992709,0.008115,0.082658,0.307692,0.290909,0.290909,0.133333,5.790000e+09
8032,0.287390,0.375,0.164346,0.992749,0.027778,0.051577,0.076923,0.036364,0.036364,0.200000,7.000000e+09


In [40]:
# Preview the test split (features are min-max scaled, "Price" is unscaled)
test_df

Unnamed: 0,District,City,Latitude,Longitude,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage,Price
1557,0.832845,1.000,0.143919,0.995776,0.002705,0.014189,0.000000,0.018182,0.018182,0.066667,7.400000e+08
1157,0.744868,1.000,0.156711,0.994235,0.012068,0.044369,0.076923,0.036364,0.036364,0.066667,2.500000e+09
352,0.759531,1.000,0.145301,0.994624,0.008115,0.039414,0.076923,0.054545,0.036364,0.066667,2.000000e+09
1018,0.826979,1.000,0.147941,0.994808,0.005410,0.007658,0.000000,0.018182,0.000000,0.066667,5.950000e+08
1394,0.812317,1.000,0.159346,0.994644,0.023928,0.022297,0.000000,0.036364,0.036364,0.200000,1.750000e+09
...,...,...,...,...,...,...,...,...,...,...,...
8196,0.428152,0.375,0.163093,0.992452,0.010092,0.079054,0.153846,0.054545,0.054545,0.066667,8.500000e+09
8068,0.369501,0.375,0.164702,0.992447,0.003121,0.022297,0.153846,0.036364,0.036364,0.000000,1.750000e+09
6562,0.225806,0.375,0.161708,0.992687,0.021432,0.080405,0.153846,0.163636,0.163636,0.000000,8.900000e+09
6983,0.428152,0.375,0.163093,0.992452,0.001561,0.019369,0.153846,0.036364,0.036364,0.000000,1.195000e+09


In [41]:
# Training features: every column except the target
X_train = train_df.drop(columns=['Price'])

# Training target: "Price" divided by 1e9, i.e. expressed in billions
y_train = train_df['Price'] / 1_000_000_000

In [42]:
# Test features: every column except the target
X_test = test_df.drop(columns=['Price'])

# Test target: "Price" divided by 1e9, i.e. expressed in billions
y_test = test_df['Price'] / 1_000_000_000

# Machine Learning

In [43]:
# NOTE(review): tensorflow is already installed and imported in the first cell,
# and no cell visible in this notebook uses tensorflow_decision_forests -
# confirm whether this install is still needed, and pin versions if it is.
%pip install tensorflow tensorflow_decision_forests

Note: you may need to restart the kernel to use updated packages.


In [57]:
# Baseline regression network: three 1024-unit ReLU layers and one linear
# output predicting "Price" in billions (the targets were divided by 1e9).

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Shape of one sample (number of feature columns), without the batch axis
input_shape = X_train.shape[1:]

# Declare the input with an explicit Input layer; passing `input_shape=` to
# the first Dense layer is deprecated in current Keras.
model = models.Sequential([
    layers.Input(shape=input_shape),
    layers.Dense(1024, activation="relu"),
    layers.Dense(1024, activation="relu"),
    layers.Dense(1024, activation="relu"),
    layers.Dense(1)
])

# Adam with its default learning rate; MSE loss, R2 reported as a metric
optimizer = tf.keras.optimizers.Adam()
model.compile(optimizer=optimizer, loss='mse', metrics=[tf.keras.metrics.R2Score()])

# Stop once the validation loss has not improved for 10 epochs and restore the
# best weights; without this the 10000-epoch budget has to be interrupted by hand.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# NOTE(review): the test set is used as validation data, so early stopping
# selects the model on the test set. Consider a separate validation split.
history = model.fit(
    X_train,
    y_train,
    epochs=10000,  # upper bound only; early stopping normally ends training sooner
    verbose=1,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)

# Calculate R-squared (R2) score on the training data
y_train_pred = model.predict(X_train)
train_r2_score = r2_score(y_train, y_train_pred)

print("Training R2 score:", train_r2_score)

Epoch 1/10000

KeyboardInterrupt: 

In [None]:
# Regularised regression network: the three 1024-unit ReLU layers are each
# followed by Dropout(0.5) and BatchNormalization; one linear output predicts
# "Price" in billions (the targets were divided by 1e9).

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks
# and batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Shape of one sample (number of feature columns), without the batch axis
input_shape = X_train.shape[1:]

# Declare the input with an explicit Input layer; passing `input_shape=` to
# the first Dense layer is deprecated in current Keras.
model = models.Sequential([
    layers.Input(shape=input_shape),
    layers.Dense(1024, activation="relu"),
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(1024, activation="relu"),
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(1024, activation="relu"),
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(1)
])

# Adam with its default initial learning rate; the rate is lowered during
# training by the ReduceLROnPlateau callback below.
optimizer = tf.keras.optimizers.Adam()
model.compile(optimizer=optimizer, loss='mse', metrics=[tf.keras.metrics.MeanSquaredError()])

# Early stopping: end training after 10 epochs without val_loss improvement
# and restore the best weights.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# Learning-rate schedule: halve the rate after 5 epochs without val_loss improvement.
lr_scheduler = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)

# NOTE(review): the test set is used as validation data, so both callbacks
# tune training on the test set. Consider a separate validation split.
history = model.fit(
    X_train,
    y_train,
    epochs=10000,  # upper bound only; early stopping normally ends training sooner
    verbose=1,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping, lr_scheduler],
)

# Calculate R-squared (R2) score on the training data
y_train_pred = model.predict(X_train)
train_r2_score = r2_score(y_train, y_train_pred)

print("Training R2 score:", train_r2_score)