In [11]:
import pandas as pd
from IPython.display import display

In [12]:
def _parse_spaced_number(values, unit_suffix=''):
    """Strip a literal unit suffix and every space from a string column, then cast to float.

    Both replacements are literal (regex=False), so the result does not depend
    on the default value of `regex` in the installed pandas version.
    """
    if unit_suffix:
        values = values.str.replace(unit_suffix, '', regex=False)
    return values.str.replace(' ', '', regex=False).astype(float)


def _min_max_scale(values, low, high):
    """Min-Max scale `values` using the given bounds: (values - low) / (high - low).

    When high == low the range is zero and the plain formula would produce NaN
    for every row; in that case 0.0 is returned for every non-missing value.
    """
    span = high - low
    if span == 0:
        # Multiplying keeps the index and leaves missing values missing.
        return values * 0.0
    return (values - low) / span


df = pd.read_csv('data-source/house_listings.csv')

# Convert the price and price_1m2 text columns to numeric values
df['price'] = _parse_spaced_number(df['price'])
df['price_1m2'] = _parse_spaced_number(df['price_1m2'], unit_suffix=' AZN/m²')

# Minimum and maximum of each numeric column
min_price = df['price'].min()
max_price = df['price'].max()
min_price_per_m2 = df['price_1m2'].min()
max_price_per_m2 = df['price_1m2'].max()

# Min-Max normalization of the listing price and the price per square metre
df['normalized_price'] = _min_max_scale(df['price'], min_price, max_price)
df['normalized_price_per_m2'] = _min_max_scale(df['price_1m2'], min_price_per_m2, max_price_per_m2)

print("Menampilkan data awal dan data normalisasi:")
display(df[['price', 'normalized_price', 'price_1m2', 'normalized_price_per_m2']])

Menampilkan data awal dan data normalisasi:


Unnamed: 0,price,normalized_price,price_1m2,normalized_price_per_m2
0,137000.0,0.021298,2630.0,0.116922
1,183000.0,0.030369,1790.0,0.076321
2,145000.0,0.022875,2230.0,0.097588
3,190000.0,0.031749,2000.0,0.086471
4,294000.0,0.052258,1550.0,0.064720
...,...,...,...,...
35498,135000.0,0.020903,2080.0,0.090338
35499,245000.0,0.042595,2720.0,0.121272
35500,309000.0,0.055216,2310.0,0.101455
35501,292000.0,0.051864,2150.0,0.093721


In [13]:
# One-hot encode the categorical columns.
# get_dummies replaces each listed column with its indicator columns, so this
# cell needs a freshly loaded `df` to be run again.
categorical_columns = ['category', 'title_deed', 'repair', 'mortgage']
df = pd.get_dummies(df, columns=categorical_columns)
display(df.head(10))

Unnamed: 0,price,currency,price_1m2,title,address,floor,area,url,room_number,normalized_price,normalized_price_per_m2,category_Köhnə tikili,category_Yeni tikili,title_deed_var,title_deed_yoxdur,repair_var,repair_yoxdur,mortgage_var
0,137000.0,AZN,2630.0,"Satılır 3 otaqlı köhnə tikili 52 m², 8-ci kilo...",Elşən Süleymanov küç 137,7 / 9,52 m²,https://bina.az/items/3858477,3.0,0.021298,0.116922,True,False,True,False,True,False,False
1,183000.0,AZN,1790.0,"Satılır 2 otaqlı yeni tikili 102 m², Neftçilər m.",Mehdi Abbasov küçəsi,2 / 17,102 m²,https://bina.az/items/3858493,2.0,0.030369,0.076321,False,True,False,True,True,False,False
2,145000.0,AZN,2230.0,"Satılır 2 otaqlı köhnə tikili 65 m², Nərimanov r.",Atatürk Prospekti,6 / 9,65 m²,https://bina.az/items/3858489,2.0,0.022875,0.097588,True,False,True,False,True,False,True
3,190000.0,AZN,2000.0,"Satılır 3 otaqlı köhnə tikili 95 m², Gənclik m.",Atatürk pr.,4 / 9,95 m²,https://bina.az/items/3858491,3.0,0.031749,0.086471,True,False,True,False,True,False,True
4,294000.0,AZN,1550.0,"Satılır 3 otaqlı yeni tikili 190 m², Nəsimi r.",Möhsün Sənani küçəsi,6 / 16,190 m²,https://bina.az/items/3858488,3.0,0.052258,0.06472,False,True,True,False,False,True,False
5,87000.0,AZN,1930.0,"Satılır 1 otaqlı köhnə tikili 45 m², Nəsimi m.",Svetlana Məmmədova 210,5 / 5,45 m²,https://bina.az/items/3845898,1.0,0.011438,0.083088,True,False,True,False,True,False,False
6,550000.0,AZN,2530.0,"Satılır 5 otaqlı yeni tikili 217 m², Nəriman N...",Qarabağ küç 55,15 / 19,217 m²,https://bina.az/items/3719344,5.0,0.102741,0.112089,False,True,False,True,True,False,False
7,125000.0,AZN,2160.0,"Satılır 2 otaqlı yeni tikili 58 m², Yeni Yasam...",Məhəmməd Xiyabani küçəsi,7 / 16,58 m²,https://bina.az/items/3858490,2.0,0.018931,0.094205,False,True,False,True,True,False,False
8,150000.0,AZN,2730.0,"Satılır 2 otaqlı köhnə tikili 55 m², Nizami m.",Nəriman Nərimanov pr.,2 / 6,55 m²,https://bina.az/items/3788954,2.0,0.023861,0.121756,True,False,True,False,True,False,False
9,62000.0,AZN,1510.0,"Satılır 2 otaqlı yeni tikili 41 m², Masazır q.",İstiqlaliyyət küçəsi,3 / 10,41 m²,https://bina.az/items/3858482,2.0,0.006508,0.062787,False,True,True,False,True,False,True


In [14]:
# Group the listings by room count and by the old-building indicator column
category = df.groupby(['room_number', 'category_Köhnə tikili'])

# Mean, median and mode of the price within each group. Named aggregation
# gives the result columns their descriptive names directly.
# mode() may return several values; the first one returned is used.
aggregated = category['price'].agg(
    mean_price='mean',
    median_price='median',
    mode_price=lambda prices: prices.mode().iloc[0],
)

# Show the result
print("Menampilkan hasil agregasi data:")
display(aggregated)

Menampilkan hasil agregasi data:


Unnamed: 0_level_0,Unnamed: 1_level_0,mean_price,median_price,mode_price
room_number,category_Köhnə tikili,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,False,122202.7,120000.0,130000.0
1.0,True,91348.51,88000.0,95000.0
2.0,False,164158.1,155000.0,145000.0
2.0,True,120029.8,115000.0,120000.0
3.0,False,255388.4,245000.0,250000.0
3.0,True,166688.9,153500.0,175000.0
4.0,False,405546.3,370000.0,350000.0
4.0,True,211675.5,180000.0,165000.0
5.0,False,627817.4,520000.0,450000.0
5.0,True,276583.5,201500.0,220000.0


In [15]:
# Remove the ' m²' suffix from 'area' and convert the column to float.
# The conversion only runs while the column is still text: once it is numeric
# the .str accessor raises AttributeError, which made re-running this cell fail.
if not pd.api.types.is_numeric_dtype(df['area']):
    df['area'] = df['area'].str.replace(' m²', '', regex=False).astype(float)

# Count the missing values
missing_values = df[['area', 'floor']].isnull().sum()
print("Jumlah nilai yang hilang pada kolom 'area' dan 'floor':")
print(missing_values)

# Fill the gaps: 'area' with the column mean, 'floor' with the placeholder 'none'
mean_area = df['area'].mean()
df['area'] = df['area'].fillna(mean_area)
df['floor'] = df['floor'].fillna('none')

# Confirm that no missing values remain
print("Jumlah nilai yang hilang setelah diisi:")
print(df[['area', 'floor']].isnull().sum())


Jumlah nilai yang hilang pada kolom 'area' dan 'floor':
area     37
floor    37
dtype: int64
Jumlah nilai yang hilang setelah diisi:
area     0
floor    0
dtype: int64


In [16]:
# Identify price outliers with the IQR rule (fences at 1.5 * IQR beyond the quartiles)
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Build the mask once. Rows with a missing price compare False on both sides,
# so they are not counted as outliers and are kept.
is_outlier = (df['price'] < lower_bound) | (df['price'] > upper_bound)
outliers = df[is_outlier]
print("Jumlah outlier sebelum dihapus:")
print(len(outliers))

# Drop the outliers. The explicit copy makes the result an independent frame,
# so later column assignments do not raise SettingWithCopyWarning.
# NOTE(review): normalized_price / normalized_price_per_m2, if already present,
# were scaled with the pre-filter min/max and are not recomputed here.
df = df[~is_outlier].copy()

# Re-check against the same bounds to confirm nothing outside them remains
outliers = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)]
print("Jumlah outlier setelah dihapus:")
print(len(outliers))


Jumlah outlier sebelum dihapus:
1930
Jumlah outlier setelah dihapus:
0


In [18]:
# Write the cleaned listings to disk; index=False leaves the row index out of the file.
cleaned_csv_path = 'house_listings_clean.csv'
df.to_csv(cleaned_csv_path, index=False)