In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [None]:
df = pd.read_csv('../data/process_v1/process_data_6.csv')
df = df[df['city'] == 0]

In [None]:
df = df[df['landSize'] <= 5000]

In [None]:
df['is_street_house'] = (df['accessibility'] == 2).astype(np.int32)

In [None]:
df['landSize_ratio'] = df['landSize'] / df['acreage']

In [None]:
mean_land_size_df = df.groupby('administrative_genre')['landSize'].mean().reset_index().rename(columns = {'landSize': 'meanLandSize'})

df = df.merge(mean_land_size_df, how='left', on = 'administrative_genre')

df['landSize_ratio_with_administrative_genre'] = df['landSize'] / df['meanLandSize']
df['acreage_ratio_with_meanLandSize'] = df['acreage'] / df['meanLandSize']


del df['meanLandSize']

def partition_for_alley_width(alley_width):
    if math.isnan(alley_width):
        return np.nan
    if alley_width <= 2.5:
        return 1
    if alley_width <= 5:
        return 2
    if alley_width <= 7:
        return 3
    if alley_width <= 8:
        return 4

In [None]:
{'fitThreeCars': 0,
 'fitOneCarAndOneMotorbike': 1,
 'notInTheAlley': 2,
 'parkCar': 3,
 'theBottleNeckPoint': 4,
 'fitTwoCars': 5,
 'narrorRoad': 6}

In [None]:
df.columns.tolist()

In [None]:
s1 = u'ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝàáâãèéêìíòóôõùúýĂăĐđĨĩŨũƠơƯưẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ'
s0 = u'AAAAEEEIIOOOOUUYaaaaeeeiioooouuyAaDdIiUuOoUuAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy'
def remove_accents(input_str):
	s = ''
	for c in input_str:
		if c in s1:
			s += s0[s1.index(c)]
		else:
			s += c

	s = "".join([c for c in list(s) if c == '_' or c.isalnum()])
	return s

full_cols = df.columns.tolist()
format_full_cols = [remove_accents(col) for col in full_cols]

df = df.rename(columns = dict(zip(full_cols, format_full_cols)))

In [None]:
cat_cols = ['is_street_house', 'ward', 'street', 'district', 'typeOfRealEstate', 'houseDirection', 'accessibility', 'certificateOfLandUseRight'] + [c for c in df.columns.tolist() if ('_district' in c or '_ward' in c or '_street' in c) and 'distance' not in c and 'num' not in c]
target_cols = ['target']
remove_cols = ['price', 'description', 'city', 'endWidth'] + [c for c in df.columns if 'distance_hcm' in c]
time_cols = ['time']
num_cols = [c for c in df.columns.tolist() if c not in (cat_cols + target_cols + remove_cols + time_cols)]

cat_cols = list(set(cat_cols))
target_cols = list(set(target_cols))
remove_cols = list(set(remove_cols))
num_cols = list(set(num_cols))

print(len(cat_cols), len(target_cols), len(remove_cols), len(num_cols))

In [None]:
cat_cols

In [None]:
num_cols

In [None]:
print(len(num_cols))

In [None]:
df.columns.tolist()

In [None]:
df[cat_cols] = df[cat_cols].fillna(100)
df[cat_cols] = df[cat_cols].astype(np.int32)

In [None]:
df[num_cols]

In [None]:
df = df[cat_cols + num_cols + time_cols + target_cols]

In [None]:
df = df[df[target_cols[0]] != np.inf]
df = df[df[target_cols[0]] != -np.inf]

In [None]:
df.shape

In [None]:
df = df[df[target_cols[0]] >= 10]
df = df[df[target_cols[0]] < 100 * 10]
# df = pd.concat([df[df['w'] < 50], df[df['w'].isnull()]])
# df = pd.concat([df[df['h'] < 50], df[df['w'].isnull()]])

In [None]:
for col in num_cols:
    df[col] = df[col].fillna(df[col].mean())

In [None]:
for c in df.columns.tolist():
    if df[c].isnull().sum():
        print(c)
        del df[c]

In [None]:
gmm_config = {
    'num_of_restaurant_in_2000m_radius': 2,
    'num_of_restaurant_in_1000m_radius': 2,
    'num_of_bank_in_2000m_radius': 3,
    'num_of_marketplace_in_2000m_radius': 2,
    'num_of_cafe_in_1000m_radius': 2
}

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
for col in gmm_config.keys():
    gmm = GaussianMixture(n_components=gmm_config[col], random_state=42)
    gmm.fit(df[col].values.reshape(-1, 1))
    df[f'gmm_{gmm_config[col]}_component_{col}'] = gmm.predict(df[col].values.reshape(-1, 1))

In [None]:
cat_cols = ['is_street_house', 'ward', 'street', 'district', 'typeOfRealEstate', 'houseDirection', 'accessibility', 'certificateOfLandUseRight'] + [c for c in df.columns.tolist() if ('_district' in c or '_ward' in c or '_street' in c) and 'distance' not in c and 'num' not in c]
cat_cols += [c for c in df.columns if 'gmm' in c]
cat_cols = list(set(cat_cols))
target_cols = ['target']
remove_cols = ['price', 'description', 'city', 'endWidth'] + [c for c in df.columns if 'distance_hcm' in c]
time_cols = ['time']
num_cols = [c for c in df.columns.tolist() if c not in (cat_cols + target_cols + remove_cols + time_cols)]

cat_cols = list(set(cat_cols))
target_cols = list(set(target_cols))
remove_cols = list(set(remove_cols))
num_cols = list(set(num_cols))

print(len(cat_cols), len(target_cols), len(remove_cols), len(num_cols))

In [None]:
cat_cols

In [None]:
num_cols

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(df[num_cols])

component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)

X_pca.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from sklearn.feature_selection import mutual_info_regression

plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)


def plot_variance(pca, width=12, dpi=100):
    # Create figure
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # Explained variance
    evr = pca.explained_variance_ratio_
    print("EVR:", evr)
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Num of component", title="% Explained Variance in Realestate Data", ylim=(0.0, 1.0)
    )
    # Cumulative Variance
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance in Realestate Data", ylim=(0.0, 1.0)
    )
    # Set up figure
    fig.set(figwidth=12, dpi=100)
    return axs
plot_variance(pca)

In [35]:
X_pca

Unnamed: 0,PC1,PC2
0,-115.896155,-164.468207
1,-16.381738,-83.304487
2,-91.876606,-91.527838
3,-84.010559,-88.528665
4,-98.295586,351.863612
...,...,...
44144,72.739731,-182.071338
44145,-122.047579,-165.515336
44146,-87.789193,-158.839148
44147,-77.178995,10.807215


In [36]:
for i in range(X_pca.shape[1]):
    df[f"PC{i+1}"] = X_pca[f"PC{i+1}"]
df

  df[f"PC{i+1}"] = X_pca[f"PC{i+1}"]
  df[f"PC{i+1}"] = X_pca[f"PC{i+1}"]


Unnamed: 0,nearest_4_district,nearest_8_street,nearest_8_district,nearest_2_ward,nearest_4_ward,nearest_8_ward,is_street_house,typeOfRealEstate,accessibility,nearest_6_street,...,num_of_kindergarten_in_2000m_radius,time,target,gmm_2_component_num_of_restaurant_in_2000m_radius,gmm_2_component_num_of_restaurant_in_1000m_radius,gmm_3_component_num_of_bank_in_2000m_radius,gmm_2_component_num_of_marketplace_in_2000m_radius,gmm_2_component_num_of_cafe_in_1000m_radius,PC1,PC2
0,100,100,100,541,100,100,0,0,0,100,...,2,2023-12-27T00:00:00,25.862069,0,0,1,1,0,-115.896155,-164.468207
1,1,2039,1,1,1,1,0,0,0,2155,...,2,2023-12-25T00:00:00,56.666667,0,1,1,1,1,-16.381738,-83.304487
2,1,1488,1,203,5,5,0,3,0,3374,...,4,2023-12-30T00:00:00,184.615385,0,0,1,1,0,-91.876606,-91.527838
3,10,3642,10,11,11,11,0,3,0,3382,...,9,2023-11-27T00:00:00,168.235294,0,0,0,1,0,-84.010559,-88.528665
4,11,3270,11,13,13,13,0,1,0,3249,...,13,2024-01-03T00:00:00,53.846154,1,1,0,1,1,-98.295586,351.863612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48888,35,100,100,461,404,100,0,1,1,100,...,0,2023-12-13T00:00:00,21.311475,0,0,1,1,0,,
48889,13,3576,13,389,347,347,0,1,4,133,...,3,2023-12-14T00:00:00,65.384615,0,0,1,1,0,,
48890,15,3365,15,152,89,152,0,2,1,4010,...,2,2023-12-14T00:00:00,81.395349,0,0,1,1,0,,
48891,26,456,40,280,61,280,0,3,0,2084,...,13,2023-12-01T00:00:00,281.250000,1,1,0,0,0,,


In [39]:
cat_cols = ['is_street_house', 'ward', 'street', 'district', 'typeOfRealEstate', 'houseDirection', 'accessibility', 'certificateOfLandUseRight'] + [c for c in df.columns.tolist() if ('_district' in c or '_ward' in c or '_street' in c) and 'distance' not in c and 'num' not in c]
cat_cols += [c for c in df.columns if 'gmm' in c]
cat_cols = list(set(cat_cols))
target_cols = ['target']
remove_cols = ['price', 'description', 'city', 'endWidth'] + [c for c in df.columns if 'distance_hcm' in c]
time_cols = ['time']
num_cols = [c for c in df.columns.tolist() if c not in (cat_cols + target_cols + remove_cols + time_cols)]

cat_cols = list(set(cat_cols))
target_cols = list(set(target_cols))
remove_cols = list(set(remove_cols))
num_cols = list(set(num_cols))

print(len(cat_cols), len(target_cols), len(remove_cols), len(num_cols))

40 1 4 150


In [40]:
cat_cols

['nearest_4_district',
 'nearest_8_street',
 'nearest_8_district',
 'nearest_2_ward',
 'nearest_4_ward',
 'nearest_8_ward',
 'is_street_house',
 'typeOfRealEstate',
 'accessibility',
 'nearest_6_street',
 'gmm_2_component_num_of_restaurant_in_1000m_radius',
 'gmm_3_component_num_of_bank_in_2000m_radius',
 'nearest_5_district',
 'ward',
 'nearest_0_district',
 'certificateOfLandUseRight',
 'houseDirection',
 'gmm_2_component_num_of_marketplace_in_2000m_radius',
 'gmm_2_component_num_of_restaurant_in_2000m_radius',
 'street',
 'nearest_7_district',
 'nearest_3_ward',
 'nearest_6_district',
 'nearest_3_street',
 'district',
 'nearest_6_ward',
 'nearest_7_street',
 'nearest_1_district',
 'nearest_5_street',
 'nearest_1_street',
 'nearest_5_ward',
 'gmm_2_component_num_of_cafe_in_1000m_radius',
 'nearest_4_street',
 'nearest_3_district',
 'nearest_0_ward',
 'nearest_7_ward',
 'nearest_0_street',
 'nearest_2_street',
 'nearest_2_district',
 'nearest_1_ward']

In [41]:
num_cols

['distance_hn_TranHungDao',
 'num_of_townhallcommunity_centre_in_500m_radius',
 'PC1',
 'num_of_restaurant_in_500m_radius',
 'num_of_kindergarten_in_500m_radius',
 'nearest_5_lat',
 'distance_nearest_6',
 'acreage_ratio_with_meanLandSize',
 'num_of_university_in_500m_radius',
 'distance_hn_CongviennuocDamSen',
 'acreage',
 'distance_hn_DuongNguyenThiDinh',
 'num_of_police_in_500m_radius',
 'distance_hn_DuongLeLoiquan1TPHCM',
 'nearest_8_lat',
 'num_of_townhallcommunity_centre_in_2000m_radius',
 'nearest_6_lat',
 'distance_hn_Congvien304',
 'num_of_restaurant_in_2000m_radius',
 'distance_nearest_7',
 'num_of_parking_in_500m_radius',
 'distance_hn_CongvienHoangVanThu',
 'distance_nearest_5',
 'PC2',
 'num_of_fuel_in_2000m_radius',
 'distance_hn_VincomDongKhoi',
 'distance_hn_CrescentMall',
 'num_of_atm_in_2000m_radius',
 'num_of_parking_in_2000m_radius',
 'facade',
 'distance_hn_CongviencaKoiRinRinPark',
 'num_of_hospital_in_2000m_radius',
 'distance_hn_CongvienGiaDinh',
 'nearest_6_lon'

In [37]:
df = df.reset_index(drop = True)
df.to_csv('../data/process_v1/process_data_9_hcm.csv', index = False)